MODULE WMRasterScale;	(** AUTHOR "TF"; PURPOSE "Support scaling of images"; *)
(** AUTHOR "MZ"; PURPOSE "Speedup rasterops with SSE2"; *)

IMPORT
	SYSTEM, Raster, Rect := WMRectangles;

CONST
	(** Copy Modes *)
	ModeCopy* = 0; ModeSrcOverDst* = 1;

	(** Scale Modes *)
	ScaleBox* = 0; ScaleBilinear* = 1;

TYPE
	Rectangle = Rect.Rectangle;
	Image = Raster.Image;
	ScalerProc = PROCEDURE (src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
	XScalerProc = PROCEDURE (srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);

(* Nearest-sample scaler for arbitrary pixel formats, copy mode.
	For every destination pixel in columns dr.l .. dr.r - 1 and rows dr.t .. dr.b - 1 the source pixel at the
	integer part of a 16.16 fixed-point position is fetched with Raster.Get and stored with Raster.Put (srcCopy).
	The source position starts at (sx, sy) and advances by sdx per destination column and sdy per destination row.
	No clamping of the source coordinates is done here. *)
PROCEDURE Q0GenericCopy(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
CONST
	One = 65536;	(* 1.0 in 16.16 fixed point *)
VAR
	dstX, dstY : LONGINT;	(* current destination pixel *)
	srcX, srcY : LONGINT;	(* current source position, 16.16 fixed point *)
	pix : Raster.Pixel;
	readMode, writeMode : Raster.Mode;
BEGIN
	Raster.InitMode(readMode, Raster.srcCopy);
	Raster.InitMode(writeMode, Raster.srcCopy);
	srcY := sy;
	dstY := dr.t;
	WHILE dstY < dr.b DO
		srcX := sx;	(* every row restarts at the left source edge *)
		dstX := dr.l;
		WHILE dstX < dr.r DO
			Raster.Get(src, srcX DIV One, srcY DIV One, pix, readMode);
			Raster.Put(dst, dstX, dstY, pix, writeMode);
			INC(srcX, sdx);
			INC(dstX)
		END;
		INC(srcY, sdy);
		INC(dstY)
	END
END Q0GenericCopy;

(* Nearest-sample scaler for arbitrary pixel formats, source-over-destination mode.
	Works like the copy variant, but the fetched source pixel is combined with the destination
	through a Raster.srcOverDst put mode instead of overwriting it.
	(sx, sy) is the 16.16 fixed-point start position in the source, sdx / sdy the 16.16 step per
	destination column / row. No clamping of the source coordinates is done here. *)
PROCEDURE Q0GenericSrcOverDst(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
CONST
	One = 65536;	(* 1.0 in 16.16 fixed point *)
VAR
	dstX, dstY : LONGINT;	(* current destination pixel *)
	srcX, srcY : LONGINT;	(* current source position, 16.16 fixed point *)
	pix : Raster.Pixel;
	readMode, blendMode : Raster.Mode;
BEGIN
	Raster.InitMode(readMode, Raster.srcCopy);
	Raster.InitMode(blendMode, Raster.srcOverDst);
	srcY := sy;
	dstY := dr.t;
	WHILE dstY < dr.b DO
		srcX := sx;	(* every row restarts at the left source edge *)
		dstX := dr.l;
		WHILE dstX < dr.r DO
			Raster.Get(src, srcX DIV One, srcY DIV One, pix, readMode);
			Raster.Put(dst, dstX, dstY, pix, blendMode);
			INC(srcX, sdx);
			INC(dstX)
		END;
		INC(srcY, sdy);
		INC(dstY)
	END
END Q0GenericSrcOverDst;

(* Bilinear scaler for arbitrary pixel formats, copy mode.
	(sx, sy) is the 16.16 fixed-point start position in the source, sdx / sdy the 16.16 step per
	destination column / row. Both start coordinates are shifted by half a pixel (8000H) before sampling.
	For each destination pixel of dr the four neighbouring source pixels (indices clamped to the source
	image through Bounds) are blended: first horizontally with the fractional part of x, then vertically
	with the fractional part of y. The result is stored with Raster.Put in srcCopy mode. *)
PROCEDURE Q1GenericCopy(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
CONST
	One = 65536;	(* 1.0 in 16.16 fixed point *)
	Half = 8000H;	(* 0.5 in 16.16 fixed point *)
VAR
	dstX, dstY : LONGINT;	(* current destination pixel *)
	posX, posY : LONGINT;	(* current source position, 16.16 fixed point *)
	colL, colR, rowT, rowB : LONGINT;	(* clamped source indices of the four neighbours *)
	wLeft, wRight, wTop, wBottom : LONGINT;	(* blend weights, wLeft + wRight = wTop + wBottom = One *)
	topB, topG, topR, topA, botB, botG, botR, botA : LONGINT;	(* horizontally blended channels of both rows *)
	tl, tr, bl, br, result : Raster.Pixel;
	readMode, writeMode : Raster.Mode;
BEGIN
	Raster.InitMode(readMode, Raster.srcCopy);
	Raster.InitMode(writeMode, Raster.srcCopy);
	posY := sy - Half;
	dstY := dr.t;
	WHILE dstY < dr.b DO
		rowT := Bounds(posY DIV One, 0, src.height - 1);
		rowB := Bounds(posY DIV One + 1, 0, src.height - 1);
		(* the vertical weights only depend on the row *)
		wBottom := posY MOD One;
		wTop := One - wBottom;
		posX := sx - Half;
		dstX := dr.l;
		WHILE dstX < dr.r DO
			colL := Bounds(posX DIV One, 0, src.width - 1);
			colR := Bounds(posX DIV One + 1, 0, src.width - 1);
			Raster.Get(src, colL, rowT, tl, readMode);
			Raster.Get(src, colR, rowT, tr, readMode);
			Raster.Get(src, colL, rowB, bl, readMode);
			Raster.Get(src, colR, rowB, br, readMode);

			wRight := posX MOD One;
			wLeft := One - wRight;

			(* horizontal blend of the upper pixel pair *)
			topB := (ORD(tl[Raster.b]) * wLeft + ORD(tr[Raster.b]) * wRight) DIV One;
			topG := (ORD(tl[Raster.g]) * wLeft + ORD(tr[Raster.g]) * wRight) DIV One;
			topR := (ORD(tl[Raster.r]) * wLeft + ORD(tr[Raster.r]) * wRight) DIV One;
			topA := (ORD(tl[Raster.a]) * wLeft + ORD(tr[Raster.a]) * wRight) DIV One;

			(* horizontal blend of the lower pixel pair *)
			botB := (ORD(bl[Raster.b]) * wLeft + ORD(br[Raster.b]) * wRight) DIV One;
			botG := (ORD(bl[Raster.g]) * wLeft + ORD(br[Raster.g]) * wRight) DIV One;
			botR := (ORD(bl[Raster.r]) * wLeft + ORD(br[Raster.r]) * wRight) DIV One;
			botA := (ORD(bl[Raster.a]) * wLeft + ORD(br[Raster.a]) * wRight) DIV One;

			(* vertical blend of the two intermediate rows *)
			result[Raster.b] := CHR((topB * wTop + botB * wBottom) DIV One);
			result[Raster.g] := CHR((topG * wTop + botG * wBottom) DIV One);
			result[Raster.r] := CHR((topR * wTop + botR * wBottom) DIV One);
			result[Raster.a] := CHR((topA * wTop + botA * wBottom) DIV One);

			Raster.Put(dst, dstX, dstY, result, writeMode);
			INC(posX, sdx);
			INC(dstX)
		END;
		INC(posY, sdy);
		INC(dstY)
	END
END Q1GenericCopy;

(* Bilinear scaler for arbitrary pixel formats, source-over-destination mode.
	(sx, sy) is the 16.16 fixed-point start position in the source, sdx / sdy the 16.16 step per
	destination column / row. Both start coordinates are shifted by half a pixel (8000H) before sampling.
	For each destination pixel of dr the four neighbouring source pixels (indices clamped to the source
	image through Bounds) are blended horizontally and then vertically; the interpolated pixel is handed
	to Raster.Put with a Raster.srcOverDst mode. *)
PROCEDURE Q1GenericSrcOverDst(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
CONST
	One = 65536;	(* 1.0 in 16.16 fixed point *)
	Half = 8000H;	(* 0.5 in 16.16 fixed point *)
VAR
	dstX, dstY : LONGINT;	(* current destination pixel *)
	posX, posY : LONGINT;	(* current source position, 16.16 fixed point *)
	colL, colR, rowT, rowB : LONGINT;	(* clamped source indices of the four neighbours *)
	wLeft, wRight, wTop, wBottom : LONGINT;	(* blend weights, wLeft + wRight = wTop + wBottom = One *)
	topB, topG, topR, topA, botB, botG, botR, botA : LONGINT;	(* horizontally blended channels of both rows *)
	tl, tr, bl, br, result : Raster.Pixel;
	readMode, blendMode : Raster.Mode;
BEGIN
	Raster.InitMode(readMode, Raster.srcCopy);
	Raster.InitMode(blendMode, Raster.srcOverDst);
	posY := sy - Half;
	dstY := dr.t;
	WHILE dstY < dr.b DO
		rowT := Bounds(posY DIV One, 0, src.height - 1);
		rowB := Bounds(posY DIV One + 1, 0, src.height - 1);
		(* the vertical weights only depend on the row *)
		wBottom := posY MOD One;
		wTop := One - wBottom;
		posX := sx - Half;
		dstX := dr.l;
		WHILE dstX < dr.r DO
			colL := Bounds(posX DIV One, 0, src.width - 1);
			colR := Bounds(posX DIV One + 1, 0, src.width - 1);
			Raster.Get(src, colL, rowT, tl, readMode);
			Raster.Get(src, colR, rowT, tr, readMode);
			Raster.Get(src, colL, rowB, bl, readMode);
			Raster.Get(src, colR, rowB, br, readMode);

			wRight := posX MOD One;
			wLeft := One - wRight;

			(* horizontal blend of the upper pixel pair *)
			topB := (ORD(tl[Raster.b]) * wLeft + ORD(tr[Raster.b]) * wRight) DIV One;
			topG := (ORD(tl[Raster.g]) * wLeft + ORD(tr[Raster.g]) * wRight) DIV One;
			topR := (ORD(tl[Raster.r]) * wLeft + ORD(tr[Raster.r]) * wRight) DIV One;
			topA := (ORD(tl[Raster.a]) * wLeft + ORD(tr[Raster.a]) * wRight) DIV One;

			(* horizontal blend of the lower pixel pair *)
			botB := (ORD(bl[Raster.b]) * wLeft + ORD(br[Raster.b]) * wRight) DIV One;
			botG := (ORD(bl[Raster.g]) * wLeft + ORD(br[Raster.g]) * wRight) DIV One;
			botR := (ORD(bl[Raster.r]) * wLeft + ORD(br[Raster.r]) * wRight) DIV One;
			botA := (ORD(bl[Raster.a]) * wLeft + ORD(br[Raster.a]) * wRight) DIV One;

			(* vertical blend of the two intermediate rows *)
			result[Raster.b] := CHR((topB * wTop + botB * wBottom) DIV One);
			result[Raster.g] := CHR((topG * wTop + botG * wBottom) DIV One);
			result[Raster.r] := CHR((topR * wTop + botR * wBottom) DIV One);
			result[Raster.a] := CHR((topA * wTop + botA * wBottom) DIV One);

			Raster.Put(dst, dstX, dstY, result, blendMode);
			INC(posX, sdx);
			INC(dstX)
		END;
		INC(posY, sdy);
		INC(dstY)
	END
END Q1GenericSrcOverDst;

(*
PROCEDURE Q0BGR565BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y : LONGINT; yadr, adr, sa, col : LONGINT;
	fx, fy : LONGINT;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 2 + dt * dstbpr;
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		sa := srcadr + (fy DIV 65536) * srcbpr;
		FOR x := dl TO dr - 1 DO
			col := SYSTEM.GET16(sa + (fx DIV 65536) * 2);
			INC(fx, sdx);
			SYSTEM.PUT16(adr, col);
			INC(adr, 2);
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q0BGR565BGR565;
*)

(* Nearest-sample scaler, 16 bit per pixel source to 16 bit per pixel destination, copy mode.
	srcadr / dstadr : address of the first pixel of the source / destination bitmap
	srcbpr / dstbpr : bytes per row of the source / destination bitmap
	dl, dt, dr, db : destination rectangle; columns dl .. dr - 1 and rows dt .. db - 1 are written
	sx, sy : start position in the source, 16.16 fixed point
	sdx, sdy : source step per destination column / row, 16.16 fixed point
	sw, sh : not used by this nearest-sample variant
	The i386 inline assembly that used to form the body is disabled (see the comment below), which left
	the procedure without any statements. The portable loop is used instead so that the procedure
	still scales when it is selected. An empty destination rectangle writes nothing. *)
PROCEDURE XQ0BGR565BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y, fx, fy, col : LONGINT;
	yadr, adr, sa : SYSTEM.ADDRESS;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 2 + dt * dstbpr;	(* first destination pixel of the first row *)
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		sa := srcadr + (fy DIV 65536) * srcbpr;	(* source row for this destination row *)
		FOR x := dl TO dr - 1 DO
			col := SYSTEM.GET16(sa + (fx DIV 65536) * 2);
			INC(fx, sdx);
			SYSTEM.PUT16(adr, col);
			INC(adr, 2)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
(* Disabled i386 implementation, kept for reference.
	Original note: this asm version is 2.3 times faster than the portable version. (P3/600/Dell precision 420 (dual))
CODE {SYSTEM.i386}
	MOV	EDX, dstadr[EBP]
	MOV	EBX, dl[EBP]
	SHL	EBX, 1
	ADD	EDX, EBX
	MOV	EBX, dt[EBP]
	IMUL	EBX, dstbpr[EBP]
	ADD	EDX, EBX	; edx = dstadr + 2 * dl + dt * dstbpr
	MOV	yadr[EBP], EDX
	; init first EDI
	MOV	EDI, EDX

	MOV	ECX, dt[EBP]
	SUB	db[EBP], ECX	; counter in db

	MOV	EDX, sdx[EBP]	; keep EDX

	; init first ESI
	MOV	ESI, srcadr[EBP]	; calc new source adr
	MOV	EAX, sy[EBP]
	SHR	EAX, 16	; integer part of sy
	IMUL 	EAX, srcbpr[EBP]	; sy * srcbpr
	ADD	ESI, EAX	; first source adr in ESI

outerloop:
	MOV	EBX, sx[EBP]
	MOV	ECX, dr[EBP]	; FOR x := dl TO dr - 1 DO
	SUB	ECX, dl[EBP]
innerloop:
	MOV	EAX, EBX
	SHR	EAX, 16
	MOV	AX, WORD PTR [ESI + EAX * 2]	; read the pixel
	ADD	EBX, EDX	; INC fx, sdx
	MOV	[EDI], AX	; set the pixel
	ADD	EDI, 2	; inc adr
	LOOP	innerloop

	; free : EAX, EBX, ECX
	MOV	EAX, sy[EBP]	; sy := sy + sdy
	ADD	EAX, sdy[EBP]
	MOV	sy[EBP], EAX	; keep sy in EAX

	MOV	ESI, srcadr[EBP]	; calc new source adr
	SHR	EAX, 16	; integer part of sy
	IMUL 	EAX, srcbpr[EBP]	; sy * srcbpr
	ADD	ESI, EAX	; new source adr in ESI

	; new dst address
	MOV	ECX, dstbpr[EBP]
	MOV	EAX, yadr[EBP]
	ADD	EAX, ECX
	MOV	EDI, EAX
	MOV	yadr[EBP], EAX

	DEC	db[EBP]
	JNLE	outerloop
*)
END XQ0BGR565BGR565;


(* Nearest-sample scaler, 16 bit per pixel source to 16 bit per pixel destination, copy mode.
	srcadr / dstadr : address of the first pixel of the source / destination bitmap
	srcbpr / dstbpr : bytes per row of the source / destination bitmap
	dl, dt, dr, db : destination rectangle; columns dl .. dr - 1 and rows dt .. db - 1 are written
	sx, sy : start position in the source, 16.16 fixed point
	sdx, sdy : source step per destination column / row, 16.16 fixed point
	sw, sh : not used by this nearest-sample variant
	The SSE2 inline assembly that used to form the body is disabled (see the comment below), which left
	the procedure without any statements. Until it is re-enabled a portable loop produces the
	pixels so that the procedure still scales when it is selected. *)
PROCEDURE SSE2Q0BGR565BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT (*; VAR  mysrc, mydest, myres: ARRAY OF LONGINT*));
VAR x, y, fx, fy, col : LONGINT;
	yadr, adr, sa : SYSTEM.ADDRESS;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 2 + dt * dstbpr;	(* first destination pixel of the first row *)
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		sa := srcadr + (fy DIV 65536) * srcbpr;	(* source row for this destination row *)
		FOR x := dl TO dr - 1 DO
			col := SYSTEM.GET16(sa + (fx DIV 65536) * 2);
			INC(fx, sdx);
			SYSTEM.PUT16(adr, col);
			INC(adr, 2)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
(* Disabled SSE2 implementation, kept for reference.
CODE {SYSTEM.i386, SYSTEM.MMX, SYSTEM.SSE, SYSTEM.SSE2}
	PUSHFD
	PUSH 		EBX
	; CLI

	MOV		EDX, dstadr[EBP]
	MOV		EBX, dl[EBP]
	SHL			EBX, 1
	ADD		EDX, EBX
	MOV		EBX, dt[EBP]
	IMUL		EBX, dstbpr[EBP]
	ADD		EDX, EBX	; edx = dstadr + 2 * dl + dt * dstbpr
	MOV		yadr[EBP], EDX

	; init first EDI
	MOV		EDI, EDX

	MOV		ECX, dt[EBP]
	SUB		db[EBP], ECX	; counter in db
	JLE			endyloop
	MOV		EDX, sdx[EBP]	; keep EDX

	; init first ESI
	MOV		ESI, srcadr[EBP]	; calc new source adr
	MOV		EAX, sy[EBP]
	SHR		EAX, 16			; integer part of sy
	IMUL 		EAX, srcbpr[EBP]	; sy * srcbpr
	ADD		ESI, EAX		; first source adr in ESI

outerloop:
	MOV		EBX, sx[EBP]
	MOV		ECX, dr[EBP]	; FOR x := dl TO dr - 1 DO
	SUB		ECX, dl[EBP]
	JLE			endyloop

innerloop:
	CMP 		ECX, 8
	 JLE			singlepixel

	PXOR 		XMM0, XMM0

	; 8pixels at the time
	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,0
	ADD		EBX, EDX			; INC fx, sdx

	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,1
	ADD		EBX, EDX			; INC fx, sdx

	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,2
	ADD		EBX, EDX			; INC fx, sdx

	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,3
	ADD		EBX, EDX			; INC fx, sdx

	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,4
	ADD		EBX, EDX			; INC fx, sdx

	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,5
	ADD		EBX, EDX			; INC fx, sdx

	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,6
	ADD		EBX, EDX			; INC fx, sdx

	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2] ; read the pixel
	PINSRW  	XMM0, EAX,7
	ADD		EBX, EDX			; INC fx, sdx

	MOVDQU 	[EDI], XMM0 		;	MOV	[EDI], AX							; set the pixels
	ADD		EDI, 16				; inc adr
	SUB 		ECX, 8
	CMP 		ECX, 0
	JE			outside2
	; LOOP 	innerloop
	JMP 		innerloop

singlepixel:
	MOV		EAX, EBX
	SHR		EAX, 16
	MOV		AX, WORD PTR [ESI + EAX * 2]	; read the pixel
	ADD		EBX, EDX			; INC fx, sdx
	MOV		[EDI], AX			; set the pixel
	ADD		EDI, 2				; inc adr
	SUB 		ECX, 1
	CMP 		ECX, 0
	JE			outside2
	; LOOP 	innerloop
	JMP 		innerloop

outside2:
	; free : EAX, EBX, ECX
	MOV		EAX, sy[EBP]		; sy := sy + sdy
	ADD		EAX, sdy[EBP]
	MOV		sy[EBP], EAX		; keep sy in EAX

	MOV		ESI, srcadr[EBP]		; calc new source adr
	SHR		EAX, 16				; integer part of sy
	IMUL 		EAX, srcbpr[EBP]	; sy * srcbpr
	ADD		ESI, EAX			; new source adr in ESI

	; new dst address
	MOV		ECX, dstbpr[EBP]
	MOV		EAX, yadr[EBP]
	ADD		EAX, ECX
	MOV		EDI, EAX
	MOV		yadr[EBP], EAX

	DEC		db[EBP]
	JNLE		outerloop

endyloop:
	EMMS 							; declare FPU registers free
	POP 		EBX
	POPFD
*)
END SSE2Q0BGR565BGR565;


(* Bilinear scaler, 16 bit 565 source to 16 bit 565 destination, copy mode.
	srcadr / dstadr : address of the first pixel of the source / destination bitmap
	srcbpr / dstbpr : bytes per row of the source / destination bitmap
	dl, dt, dr, db : destination rectangle; columns dl .. dr - 1 and rows dt .. db - 1 are written
	sx, sy : start position in the source, 16.16 fixed point (shifted by half a pixel before sampling)
	sdx, sdy : source step per destination column / row, 16.16 fixed point
	sw, sh : source width / height in pixels, used to clamp the sample indices through Bounds
	Each 16 bit pixel is split into a 5 bit field (bits 0..4), a 6 bit field (bits 5..10) and a 5 bit field
	(bits 11..15). The fields are widened to 8 bit, interpolated and packed again. *)
PROCEDURE Q1BGR565BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
CONST
	One = 65536;	(* 1.0 in 16.16 fixed point *)
	Half = 8000H;	(* 0.5 in 16.16 fixed point *)
VAR
	col, row : LONGINT;	(* current destination pixel *)
	posX, posY : LONGINT;	(* current source position, 16.16 fixed point *)
	wLeft, wRight, wTop, wBottom : LONGINT;	(* blend weights *)
	offL, offR : LONGINT;	(* byte offsets of the left / right sample inside a source row *)
	tl, tr, bl, br : LONGINT;	(* the four raw 16 bit samples *)
	topB, topG, topR, botB, botG, botR : LONGINT;	(* horizontally blended channels of both rows *)
	outB, outG, outR : LONGINT;	(* final 8 bit channels *)
	dstRow, dstPix, srcTop, srcBot : SYSTEM.ADDRESS;
BEGIN
	dstRow := dstadr + dl * 2 + dt * dstbpr;
	posY := sy - Half;
	row := dt;
	WHILE row < db DO
		srcTop := srcadr + Bounds(posY DIV One, 0, sh - 1) * srcbpr;
		srcBot := srcadr + Bounds(posY DIV One + 1, 0, sh - 1) * srcbpr;
		(* the vertical weights only depend on the row *)
		wBottom := posY MOD One;
		wTop := One - wBottom;
		posX := sx - Half;
		dstPix := dstRow;
		col := dl;
		WHILE col < dr DO
			offL := Bounds(posX DIV One, 0, sw - 1) * 2;
			offR := Bounds(posX DIV One + 1, 0, sw - 1) * 2;
			tl := SYSTEM.GET16(srcTop + offL);
			tr := SYSTEM.GET16(srcTop + offR);
			bl := SYSTEM.GET16(srcBot + offL);
			br := SYSTEM.GET16(srcBot + offR);

			wRight := posX MOD One;
			wLeft := One - wRight;

			(* upper pixel pair: unpack to 8 bit and blend horizontally *)
			topB := ((tl MOD 32) * 8 * wLeft + (tr MOD 32) * 8 * wRight) DIV One;
			topG := ((tl DIV 32 MOD 64) * 4 * wLeft + (tr DIV 32 MOD 64) * 4 * wRight) DIV One;
			topR := ((tl DIV 2048 MOD 32) * 8 * wLeft + (tr DIV 2048 MOD 32) * 8 * wRight) DIV One;

			(* lower pixel pair *)
			botB := ((bl MOD 32) * 8 * wLeft + (br MOD 32) * 8 * wRight) DIV One;
			botG := ((bl DIV 32 MOD 64) * 4 * wLeft + (br DIV 32 MOD 64) * 4 * wRight) DIV One;
			botR := ((bl DIV 2048 MOD 32) * 8 * wLeft + (br DIV 2048 MOD 32) * 8 * wRight) DIV One;

			(* vertical blend *)
			outB := (topB * wTop + botB * wBottom) DIV One;
			outG := (topG * wTop + botG * wBottom) DIV One;
			outR := (topR * wTop + botR * wBottom) DIV One;

			(* pack the 8 bit channels back into 5-6-5 *)
			SYSTEM.PUT16(dstPix, ASH(outB, -3) + ASH(ASH(outG, -2), 5) + ASH(ASH(outR, -3), 11));
			INC(dstPix, 2);
			INC(posX, sdx);
			INC(col)
		END;
		INC(posY, sdy);
		INC(dstRow, dstbpr);
		INC(row)
	END
END Q1BGR565BGR565;

(* Bilinear scaler, 16 bit 565 source to 16 bit 565 destination, copy mode (SSE2 entry point).
	Parameters have the same meaning as for Q1BGR565BGR565.
	The SSE2 inline assembly that used to form the body is disabled (see the comment below), which left
	the procedure without any statements. Until it is re-enabled the call is forwarded to the
	portable implementation so that the procedure still scales when it is selected. *)
PROCEDURE SSE2Q1BGR565BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
BEGIN
	Q1BGR565BGR565(srcadr, dstadr, srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh)
(* Disabled SSE2 implementation, kept for reference.
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr, adr, col, col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, b1, g1, r1, cb, cg, cr : LONGINT;
	fx, fy, yadd1, yadd2, xadd1, xadd2 : LONGINT;

CODE {SYSTEM.i386, SYSTEM.MMX, SYSTEM.SSE, SYSTEM.SSE2}
	PUSHFD
	PUSH 		EBX
	; CLI

	; create masks

	; PXOR	XMM2, XMM2
	; PXOR		XMM3, XMM3

	PXOR		XMM4, XMM4
	PXOR		XMM5, XMM5
	PXOR		XMM6, XMM6

	; PXOR		XMM7, XMM7

	; dest red -> XMM4
	; dest green -> XMM5
	; dest blue-> XMM6

	MOV	 	EAX, 0F800H
	MOV	 	EBX, 07E0H
	MOV		ECX, 01FH
	PINSRW	XMM4, EAX,0
	PINSRW	XMM5, EBX,0
	PINSRW	XMM6, ECX,0
	PINSRW	XMM4, EAX,1
	PINSRW	XMM5, EBX,1
	PINSRW	XMM6, ECX,1
	PINSRW	XMM4, EAX,2
	PINSRW	XMM5, EBX,2
	PINSRW	XMM6, ECX,2
	PINSRW	XMM4, EAX,3
	PINSRW	XMM5, EBX,3
	PINSRW	XMM6, ECX,3
;	introallq1(dstadr,dl,dt,dstbpr,sy,yadr,sx,fy);

	MOV		EDX, dstadr[EBP]
	MOV		EBX, dl[EBP]
	SHL			EBX, 1
	ADD		EDX, EBX
	MOV		EBX, dt[EBP]
	IMUL		EBX, dstbpr[EBP]
	ADD		EDX, EBX
	MOV		yadr[EBP], EDX

	MOV		EDX, sy[EBP]
	SUB		EDX, 8000H 			;edx = sy-8000H
	MOV		fy[EBP], EDX

	; sx := sx - 8000H;

	SUB		sx[EBP], 8000H ;edx = sx-8000H

	;FOR y := dt TO db - 1 DO
	MOV		ECX, db[EBP]
	SUB		ECX, dt[EBP]			; counter in y
	JLE			endyloop
	MOV		y[EBP], ECX


outerloop:
	;q1xxall(adr,fx,sw,yadd1,yadd2,yftop,yfbottom,sdx,dr,dl);
	MOV		EDX, yadr[EBP]
	MOV		EDI, EDX 				; adr in EDI
	;MOV	adr[EBP], EDX

	MOV		EDX, sx[EBP]			; keep EDX
	MOV		fx[EBP], EDX


	MOV 		EAX, fy[EBP]
	PINSRW	XMM3, EAX,0 			; prepare for top, bottom

	SAR 		EAX, 16
	CMP 		EAX, 0
	JE			zero
	JL			negativ
	MOV		EBX, sh[EBP]
	SUB		EBX, 1
	CMP		EAX, EBX
	JGE			bigger

ok:
	MOV		EBX, EAX
	ADD		EBX, 1
	JMP 		different

zero:
	MOV 		EAX, 0
	MOV		EBX, 1
	JMP 		different

negativ:
	MOV 		EAX, 0
	MOV		EBX, 0
	JMP			samepixel

bigger:
	MOV		EAX, EBX
	JMP			samepixel

different:
	MOV		ECX, srcbpr[EBP]
	MUL		EAX, ECX
	MOV		EBX, EAX
	ADD		EBX, ECX
	MOV		ECX, srcadr[EBP]
	ADD		EAX, ECX
	ADD		EBX, ECX
	JMP			endyadd

samepixel:
	MOV		ECX, srcbpr[EBP]
	MUL		EAX, ECX
	MOV		ECX, srcadr[EBP]
	ADD		EAX, ECX
	MOV		EBX, EAX

endyadd:
	MOV		yadd1[EBP], EAX
	MOV		yadd2[EBP], EBX

	; yfbottom := (fy MOD 65536);
	; yftop := (65536 - fy MOD 65536);
	PEXTRW	EDX, XMM3,0
	AND		EDX, 0FFFFH
	PINSRW 	XMM3, EDX, 1
	NEG		EDX
	ADD		EDX, 65535
	PINSRW 	XMM3, EDX, 0
	PSRLW 		XMM3, 1

	MOV		ECX, dr[EBP]
	SUB		ECX, dl[EBP]			; counter in y
	JLE			endyloop				;exit
	MOV		x[EBP], ECX

innerloop:
	MOV 		ECX, fx[EBP]

	PINSRW	XMM7, ECX,0 ; prepare for l,r

	SAR 		ECX, 16
	CMP 		ECX, 0
	JE			zerox
	JL			negativx
	MOV		EDX, sw[EBP]
	SUB		EDX, 1
	CMP		ECX, EDX
	JGE			biggerx

okx:
	MOV		EDX, ECX
	ADD		EDX, 1
	JMP			endbound2
zerox:
	MOV 		ECX, 0
	MOV		EDX, 1
	JMP 		endbound2
negativx:
	MOV 		ECX, 0
	MOV		EDX, 0
	JMP			endbound2
biggerx:
	MOV		ECX, EDX
endbound2:
	SHL			ECX, 1
	SHL			EDX, 1
endaddx:
	MOV		EAX, yadd1[EBP]
	MOV		EBX, yadd2[EBP]

	PINSRW	XMM2, [EAX+ECX], 0
	PINSRW	XMM2, [EAX+EDX], 1
	PINSRW	XMM2, [EBX+ECX], 2
	PINSRW	XMM2, [EBX+EDX], 3

	PEXTRW	EAX, XMM7,0
	AND		EAX, 0FFFFH
	PINSRW 	XMM7, EAX,1
	PINSRW	XMM7, EAX, 3
	NEG		EAX
	ADD		EAX, 65535
	PINSRW 	XMM7, EAX, 0
	PINSRW	XMM7, EAX, 2
	PSRLW 		XMM7, 1

	; calculate red
	MOVDQU	XMM0, XMM2
	PAND		XMM0, XMM4

	PSRLW  	XMM0, 8			;SRL16bit XMM0,8

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15
	PSHUFLW	XMM0, XMM0, 58H

	PMADDWD	XMM0, XMM3

	PSRLD		XMM0,7   			; XMM3 already shifted by 1
	PAND		XMM0, XMM4
	PEXTRW 	EBX, XMM0,0

	; red done

; calculate green
	MOVDQU	XMM0, XMM2
	PAND		XMM0, XMM5		;SLL 16bit XMM0, 8
	PSRLW  	XMM0, 3			;SRL16bit XMM0,24

	PMADDWD XMM0,XMM7
	PSRLD		XMM0,15     		; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H

	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,12    			; XMM3 already shifted by 1, 5 more to get correct position

	PAND		XMM0, XMM5
	PEXTRW 	EAX, XMM0,0
	OR			EBX,EAX

	; green done

	; calculate blue
	MOVDQU	XMM0, XMM2
	PAND		XMM0, XMM6
	PSLLW  		XMM0, 3			;SLL16bit XMM0,3

	PMADDWD XMM0,XMM7
	PSRLD		XMM0,15     		; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H

	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,18    			; XMM3 already shifted by 1, 11 more to get correct position

	PAND		XMM0, XMM6
	PEXTRW 	EAX, XMM0,0
	OR			EBX,EAX
	; blue done

	MOV		[EDI], BX

	MOV		ECX, fx[EBP]
	ADD		ECX, sdx[EBP]
	MOV		fx[EBP],ECX

	ADD		EDI, 2				; inc adr

	SUB		x[EBP], 1
	JNZ			innerloop

endxloop:
	MOV		EAX,fy[EBP]			; fy := fy + sdy
	ADD		EAX, sdy[EBP]
	MOV		fy[EBP], EAX

	MOV		EAX,yadr[EBP]
	ADD		EAX, dstbpr[EBP]
	MOV		EDI, EAX
	MOV		yadr[EBP], EAX

	SUB		y[EBP], 1
	JNZ			outerloop

endyloop:
	EMMS 							; declare FPU registers free
	POP 		EBX
	POPFD
*)
END SSE2Q1BGR565BGR565;

(* Bilinear scaler, 32 bit source with alpha in the top byte onto a 16 bit 565 destination,
	blending the interpolated source over the existing destination.
	srcadr / dstadr : address of the first pixel of the source / destination bitmap
	srcbpr / dstbpr : bytes per row of the source / destination bitmap
	dl, dt, dr, db : destination rectangle; columns dl .. dr - 1 and rows dt .. db - 1 are processed
	sx, sy : start position in the source, 16.16 fixed point (shifted by half a pixel before sampling)
	sdx, sdy : source step per destination column / row, 16.16 fixed point
	sw, sh : source width / height in pixels, used to clamp the sample indices through Bounds
	Per pixel: the alpha channel is interpolated first. Alpha 0 leaves the destination untouched, alpha 255
	stores the interpolated colour directly, anything in between adds (256 - alpha) / 256 of the destination
	colour to the interpolated source colour.
	Blended channels are saturated at 255. The previous limit of 256 produced 32 (resp. 64) after the
	shift to 5 (resp. 6) bits, which overflowed into the neighbouring field of the packed 565 word. *)
PROCEDURE Q1BGRA8888BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr: SYSTEM.ADDRESS; col, col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca, dstb, dstg, dstr : LONGINT;
	fx, fy, xadd0, xadd1: LONGINT; yadd0, yadd1: SYSTEM.ADDRESS;
BEGIN
	yadr := dstadr + dl * 2 + dt * dstbpr;
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dt TO db - 1 DO
		fx := sx;
		dstadr := yadr;
		yadd0 := srcadr + Bounds(fy DIV 65536, 0, sh - 1) * srcbpr;
		yadd1 := srcadr + Bounds(fy DIV 65536 + 1, 0, sh - 1) * srcbpr;
		(* the vertical weights only depend on the row *)
		yfbottom := fy MOD 65536;
		yftop := 65536 - yfbottom;
		FOR x := dl TO dr - 1 DO
			xadd0 := Bounds(fx DIV 65536, 0, sw - 1) * 4;
			xadd1 := Bounds(fx DIV 65536 + 1, 0, sw - 1) * 4;
			col0 := SYSTEM.GET32(yadd0 + xadd0);
			col1 := SYSTEM.GET32(yadd0 + xadd1);
			col2 := SYSTEM.GET32(yadd1 + xadd0);
			col3 := SYSTEM.GET32(yadd1 + xadd1);

			xfleft := (65536 - fx MOD 65536);
			xfright := (fx MOD 65536);

			(* alpha first: it decides whether the colour channels are needed at all *)
			a0 := ((col0 DIV 1000000H MOD 100H) * xfleft + (col1 DIV 1000000H MOD 100H) * xfright) DIV 65536;
			a1 := ((col2 DIV 1000000H MOD 100H) * xfleft + (col3 DIV 1000000H MOD 100H) * xfright) DIV 65536;
			ca := (a0 * yftop + a1 * yfbottom) DIV 65536;

			IF ca # 0 THEN
				b0 := ((col0 MOD 100H) * xfleft + (col1 MOD 100H) * xfright) DIV 65536;
				g0 := ((col0 DIV 100H MOD 100H) * xfleft + (col1 DIV 100H MOD 100H) * xfright) DIV 65536;
				r0 := ((col0 DIV 10000H MOD 100H) * xfleft + (col1 DIV 10000H MOD 100H) * xfright) DIV 65536;

				b1 := ((col2 MOD 100H) * xfleft + (col3 MOD 100H) * xfright) DIV 65536;
				g1 := ((col2 DIV 100H MOD 100H) * xfleft + (col3 DIV 100H MOD 100H) * xfright) DIV 65536;
				r1 := ((col2 DIV 10000H MOD 100H) * xfleft + (col3 DIV 10000H MOD 100H) * xfright) DIV 65536;

				cb := (b0 * yftop + b1 * yfbottom) DIV 65536;
				cg := (g0 * yftop + g1 * yfbottom) DIV 65536;
				cr := (r0 * yftop + r1 * yfbottom) DIV 65536;
				IF ca # 255 THEN
					(* destination colour, only needed when actually blending *)
					col := SYSTEM.GET16(dstadr);
					dstb := (col MOD 32) * 8; dstg := (col DIV 32 MOD 64) * 4; dstr := (col DIV 2048 MOD 32) * 8;
					(* saturate at 255 so that the packed fields below cannot overflow *)
					cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 255 THEN cb := 255 END;
					cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 255 THEN cg := 255 END;
					cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 255 THEN cr := 255 END
				END;
				SYSTEM.PUT16(dstadr, ASH(cb, -3) + ASH(ASH(cg, -2), 5) + ASH(ASH(cr, -3), 11))
			END;
			INC(fx, sdx);
			INC(dstadr, 2);
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q1BGRA8888BGR565;

PROCEDURE SSE2Q1BGRA8888BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh:LONGINT);
(*
	VAR x, y, z,xfleft, xfright, yftop, yfbottom : LONGINT; yadr, adr, col, col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, a0, a01,b1, g1, r1, a1, cb, cg, cr,cb2, cg2, cr2, ca, ca2,dstb, dstg, dstr,res : LONGINT;
	fx, fy, yadd1, yadd2, xadd1, xadd2: LONGINT;

CODE {SYSTEM.i386, SYSTEM.MMX, SYSTEM.SSE, SYSTEM.SSE2}
	PUSHFD
	PUSH 		EBX
	; CLI
	PXOR		MM3,MM3
	PXOR		MM4,MM4
	PXOR		MM5, MM5
	PXOR		MM6, MM6
	PXOR		XMM1, XMM1
	PXOR		XMM3, XMM3
	PXOR		XMM4, XMM4
	PXOR		XMM6, XMM6
	PXOR		XMM7, XMM7

	MOV		EDX, dstadr[EBP]
	MOV		EBX, dl[EBP]
	SHL			EBX, 1
	ADD		EDX, EBX
	MOV		EBX, dt[EBP]
	IMUL		EBX, dstbpr[EBP]
	ADD		EDX, EBX
	MOV		yadr[EBP], EDX

	MOV		EDX, sy[EBP]
	SUB		EDX, 8000H 			;edx = sy-8000H
	MOV	 	fy[EBP], EDX

	; sx := sx - 8000H;
	MOV		EDX, sx[EBP]
	SUB		EDX, 8000H 			;sx = sx-8000H
	MOV		sx[EBP] , EDX

	MOV		ECX, db[EBP]
	SUB		ECX, dt[EBP]			; counter in y
	JLE			endyloop				;exit
	MOV		y[EBP], ECX

outerloop:
	MOV		EDX, yadr[EBP]
	MOV		EDI, EDX ; adr in EDI

	MOV		adr[EBP], EDX

	MOV		EDX, sx[EBP]			; keep EDX
	MOV		fx[EBP], EDX

	MOV 		EAX, fy[EBP]
	MOVD		XMM3, EAX 			; prepare for top, bottom
	SAR 		EAX, 16
	CMP 		EAX, 0
	JE			zero
	JL			negativ
	MOV		EBX, sh[EBP]
	SUB		EBX, 1
	CMP		EAX, EBX
	JGE			bigger

ok:
	MOV		EBX, EAX
	ADD		EBX, 1
	JMP 		different

zero:
	MOV 		EAX, 0
	MOV		EBX, 1
	JMP 		different

negativ:
	MOV 		EAX, 0
	MOV		EBX, 0
	JMP			samepixel

bigger:
	MOV		EAX, EBX
	JMP			samepixel

different:
	MOV		ECX, srcbpr[EBP]
	MUL		EAX, ECX
	MOV		EBX, EAX
	ADD		EBX, ECX
	MOV		ECX, srcadr[EBP]
	ADD		EAX, ECX
	ADD		EBX, ECX
	JMP			endyadd

samepixel:
	MOV		ECX, srcbpr[EBP]
	MUL		EAX, ECX
	MOV		ECX, srcadr[EBP]
	ADD		EAX, ECX
	MOV		EBX, EAX

endyadd:
	MOV		yadd1[EBP], EAX
	MOV		yadd2[EBP], EBX

	; yfbottom := (fy MOD 65536);
	; yftop := (65536 - fy MOD 65536);

	MOVD		ECX, XMM3
	AND		ECX, 0FFFFH
	MOV		yfbottom[EBP],ECX
	PINSRW 	XMM3, ECX, 1

	NEG		ECX
	ADD		ECX, 65535
	MOV		yftop[EBP],ECX
	PINSRW 	XMM3, ECX, 0

	PSRLW		XMM3, 1

	MOV		ECX, dr[EBP]
	SUB		ECX, dl[EBP]			; counter in x
	JLE			endyloop				;exit
	MOV		x[EBP], ECX

innerloop:
	MOV 		ECX, x[EBP]
	; if x < 8 then do one pixel at the time
	CMP		ECX, 8
	JL 			singlepixel
	; else
	; take 8 at the time

	MOV		EBX, EDI
	AND 		EBX, 0FH
	CMP		EBX, 0
	JNE	 		singlepixel

alleightpixels:
	MOV	 	EAX, 0000000FFH
	MOVD		MM3, EAX

	; dest red -> MM4
	MOV	 	EAX, 0F800F800H
	MOVD		MM4, EAX

	; dest green -> MM5
	MOV	 	EAX, 07E007E0H
	MOVD		MM5, EAX

	; dest blue -> MM6 ; moved as MM6 is used in singlepixel
	; MOV	 EAX, 001F001FH
	; MOVD		MM6, EAX

	MOV		ECX, yfbottom[EBP]
	PINSRW 	XMM3, ECX, 1
	MOV		ECX, yftop[EBP]
	PINSRW 	XMM3, ECX, 0
	PSRLW 		XMM3,1

	PXOR		XMM5, XMM5
	PXOR 		XMM2,XMM2
	MOV		z[EBP], 4

loop03:
	; shift everything left
	MOV 		ECX, fx[EBP]
	PSLLDQ		XMM5, 4

	PINSRW	XMM7, ECX,0 ; prepare for l,r

	SAR 		ECX, 16
	CMP 		ECX, 0
	JE			zerox03
	JL			negativx03
	MOV		EDX, sw[EBP]
	SUB		EDX, 1
	CMP		ECX, EDX
	JGE			biggerx03

okx03:
	MOV		EDX, ECX
	ADD		EDX, 1
	JMP			endbound203
zerox03:
	MOV 		ECX, 0
	MOV		EDX, 1
	JMP 		endbound203

negativx03:
	MOV 		ECX, 0
	MOV		EDX, 0
	JMP			endbound203

biggerx03:
	MOV		ECX, EDX
endbound203:
	SHL			ECX, 2 					; xadd1
	SHL			EDX, 2 					; xadd2

	MOV		EAX, yadd1[EBP]
	MOV		EBX, yadd2[EBP]

	MOVD		XMM2, [EBX+EDX]
	PSLLDQ   	XMM2,4
	MOVD		XMM1, [EBX+ECX]
	POR		XMM2,XMM1
	PSLLDQ   	XMM2,4
	MOVD		XMM1, [EAX+EDX]
	POR		XMM2,XMM1
	PSLLDQ		XMM2,4
	MOVD		XMM1, [EAX+ECX]
	POR		XMM2,XMM1

	PEXTRW	EAX,XMM7,0
	AND		EAX, 0FFFFH
	PINSRW 	XMM7, EAX,1
	PINSRW	XMM7, EAX, 3 			;xfright

	NEG		AX
	ADD		EAX, 65535
	PINSRW 	XMM7, EAX, 0
	PINSRW	XMM7, EAX, 2 			;xfleft

	PSRLW 		XMM7, 1

	MOVDQU	XMM0, XMM2
	PSRLD		XMM0, 24
	PXOR		XMM1, XMM1

	MOV		ECX, 0FFH 				; ECX locked for ca

	PINSRW	XMM1, ECX,0
	PINSRW	XMM1, ECX,2
	PINSRW	XMM1, ECX,4
	PINSRW	XMM1, ECX,6

	PCMPEQW	XMM1, XMM0

	PMOVMSKB	EAX, XMM1
	CMP		EAX, 0FFFFH
	JE 			endofalpha03

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H
	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW	ECX, XMM0, 0

endofalpha03:
	; alpha done

	CMP		ECX,0
	JE			alphazero03

	SHL			ECX, 24

	; calculate red

	MOVDQU	XMM0, XMM2
	PSLLD		XMM0, 8
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0

	SHL	EBX,16
	OR	ECX,EBX

	; red done

	; calculate green
	MOVDQU	XMM0, XMM2
	PSLLD		XMM0, 16
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0
	SHL 		EBX,8
	OR 			ECX,EBX

	; green done

	; calculate blue
	MOVDQU	XMM0, XMM2
	PSLLD		XMM0,24
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0, XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0

	OR			ECX,EBX
	; blue done

	; put color in correct position
	MOVD		XMM4,ECX
	POR		XMM5, XMM4 ; results in XMM5

	; prepared source

alphazero03: ; set mask is done later
	MOV		ECX,fx[EBP]
	ADD		ECX, sdx[EBP]
	MOV		fx[EBP],ECX

	SUB 		z[EBP], 1
	JNZ 		loop03

endofloop03:
	MOV		z[EBP], 4

loop47:
	; shift everything left
	PSLLDQ		XMM6, 4

	PINSRW	XMM7, ECX,0 ; prepare for l,r

	SAR 		ECX, 16
	CMP 		ECX, 0
	JE			zerox47
	JL			negativx47
	MOV		EDX, sw[EBP]
	SUB		EDX, 1
	CMP		ECX, EDX
	JGE			biggerx47

okx47:
	MOV		EDX, ECX
	ADD		EDX, 1
	JMP			endbound247
zerox47:
	MOV 		ECX, 0
	MOV		EDX, 1
	JMP 		endbound247

negativx47:
	MOV 		ECX, 0
	MOV		EDX, 0
	JMP			endbound247

biggerx47:
	MOV		ECX, EDX
endbound247:
	SHL			ECX, 2 						; xadd1
	SHL			EDX, 2 						; xadd2

	MOV		EAX, yadd1[EBP]
	MOV		EBX, yadd2[EBP]

	MOVD		XMM2, [EBX+EDX]
	PSLLDQ   	XMM2,4
	MOVD		XMM1, [EBX+ECX]
	POR		XMM2,XMM1
	PSLLDQ   	XMM2,4
	MOVD		XMM1, [EAX+EDX]
	POR		XMM2,XMM1
	PSLLDQ		XMM2,4
	MOVD		XMM1, [EAX+ECX]
	POR		XMM2,XMM1

	PEXTRW	EAX,XMM7,0
	AND		EAX, 0FFFFH
	PINSRW 	XMM7, EAX,1
	PINSRW	XMM7, EAX, 3 				;xfright

	NEG		EAX
	ADD		EAX, 65535
	PINSRW 	XMM7, EAX, 0
	PINSRW	XMM7, EAX, 2 				;xfleft

	PSRLW 		XMM7, 1

	MOVDQU	XMM0, XMM2
	PSRLD		XMM0, 24
	PXOR		XMM1, XMM1

	MOV		ECX, 0FFH 					; ECX locked for ca

	PINSRW	XMM1, ECX,0
	PINSRW	XMM1, ECX,2
	PINSRW	XMM1, ECX,4
	PINSRW	XMM1, ECX,6

	PCMPEQW	XMM1, XMM0

	PMOVMSKB	EAX, XMM1
	CMP		EAX, 0FFFFH
	JE 			endofalpha47

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H
	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW	ECX, XMM0, 0

endofalpha47:
	; alpha done
	CMP		ECX,0
	JE			alphazero47

	SHL			ECX, 24

	; calculate red

	MOVDQU	XMM0, XMM2
	PSLLD		XMM0, 8
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0

	SHL			EBX,16
	OR			ECX,EBX

	; red done

	; calculate green
	MOVDQU	XMM0, XMM2
	PSLLD		XMM0, 16
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0
	SHL 		EBX,8
	OR 			ECX,EBX

	; green done

	; calculate blue
	MOVDQU	XMM0, XMM2
	PSLLD		XMM0,24
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 				; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   				; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0

	OR			ECX,EBX

	; blue done

	; put color in correct position
	MOVD		XMM4,ECX
	POR		XMM6, XMM4 			; results in XMM6

	; prepared source

alphazero47: ; set mask is done later
	MOV		ECX,fx[EBP]
	ADD		ECX, sdx[EBP]
	MOV		fx[EBP],ECX

	SUB		 z[EBP], 1
	JNZ 		loop47

endofloop47:
	; all sources calculated, but in reversed order
	PSHUFD 	XMM2,XMM5, 1AH
	PSHUFD 	XMM1,XMM6, 1AH

	; now sources ready for further calculation with destination
	; get alphas
	MOVQ2DQ	XMM4,  MM3
	MOVDQU 	XMM6, XMM2
	PSHUFD		XMM4, XMM4, 0
	MOVDQU 	XMM5, XMM1
	PSLLD 		XMM4, 24
	PAND 		XMM6, XMM4 			; alpha 5-8 in XMM6
	PAND 		XMM5, XMM4  			; alpha 1-4 in XMM5
	PSRLD 		XMM5, 24
	PSHUFHW 	XMM5, XMM5, 85H
	PSRLD 		XMM6, 24

	; put both alphas into 1 register
	PSHUFHW 	XMM6, XMM6, 85H
	PSHUFLW 	XMM5, XMM5, 85H
	PSHUFLW 	XMM6, XMM6, 58H
	PSHUFD		XMM5, XMM5, 0D0H  	; 0102030400000000
	PSHUFD 	XMM6, XMM6, 5CH 		; 0000000005060708
	PXOR 		XMM0,XMM0
	POR		XMM5, XMM6            	; XMM5 = alphas 0102030405060708

	PCMPEQD 	XMM0, XMM5
	PMOVMSKB EAX, XMM0
	CMP 		EAX, 0FFFFH				; all alphas = zero; TEST not possible, because only 8 bits compared
	JE      		endloop

	; mask out alpha = zero

	; fd := 255-ORD(src[a]); fd = XMM4
	; MOV 	XMM4, 00FF00FF00FF00FF00FF00FF00FF00FFH
	PXOR 		XMM4, XMM4
	MOV	 	EAX, 00FFH
	PINSRW	XMM4, EAX ,0
	PSHUFLW 	XMM4, XMM4, 0
	PSHUFD 	XMM4, XMM4, 0
	PSUBW 		XMM4, XMM5
	MOV 		EAX,1H
	PINSRW	XMM3, EAX ,0
	PSHUFLW 	XMM3, XMM3, 0
	PSHUFD 	XMM3, XMM3, 0
	PADDUSW 	XMM4, XMM3

	; new red
	; calculate red 2

	; get source

	; sred14 = src14 && (srcMask <<16)
	; srcMask << 16
	MOVQ2DQ 	XMM3, MM3
	PSHUFD 	XMM3, XMM3, 0
	MOVDQU 	XMM5, XMM1
	MOVDQU 	XMM6, XMM2
	PSLLD 		XMM3, 16

	; sred14 = src14 && (srcMask << 24)
	; src14 must be copied because it mustn't be changed
	PAND 		XMM5, XMM3 				; sred14
	PSRLD 		XMM5, 16

	; sred14s = shuffled sred14
	PSHUFHW 	XMM5, XMM5,85H
	PAND 		XMM6, XMM3 				; sred58
	PSRLD 		XMM6, 16

	PSHUFLW 	XMM5, XMM5,85H
	PSHUFHW 	XMM6, XMM6,85H
	PSHUFD  	XMM5, XMM5,0D0H 		; sred14s
	PSHUFLW 	XMM6, XMM6,58H
	PSHUFD  	XMM6, XMM6,5CH 			; sred58s
	POR 		XMM5, XMM6 				; sred18

	; sred18255 = sred18 * 256- sred18
	MOVDQU 	XMM7, XMM5
	PSLLW 		XMM5, 8
	PSUBUSW 	XMM5, XMM7 				; sred18255

	; src is now ready

	; destination
	; dest18 must be copied because it mustn't be changed
	; Load data into memory
	MOV 		EDI, adr[EBP]
	MOVDQU 	XMM3, [EDI]  				;dest 1-8
	MOVQ2DQ  XMM6, MM4
	PSHUFD 	XMM6, XMM6, 0
	MOVDQU 	XMM7, XMM3

	PAND 		XMM7, XMM6 				; dred18
	PSRLW 		XMM7, 8
	;  dred18alpha = dred18 * negalpha
	PMULLW 	XMM7, XMM4 				; dred18alpha

	; dest is prepared
	; combining dest and src

	; dred18big = sred18255 + dred18alpha

	PADDUSW 	XMM7, XMM5 ; dred18big
	; dred18f = dred18big && destMaskred128  because >> 11 and << 11 is && mask
	PAND 		XMM7, XMM6 ; dred18f

 	; dest18nr0 = dest18 && (~destMaskred128)
 	PANDN 	XMM6, XMM3  				; dest18nr0

 	; dest18nrf = dest18nr0 || dred18f
 	POR 		XMM6, XMM7

	MOVDQU 	XMM3, XMM6

	; red is calculated

	; calculate green:
	; get source

	; sgreen14 = src14 && (srcMask <<8)
	; srcMask << 8
	MOVQ2DQ 	XMM7, MM3

	PSHUFD 	XMM7, XMM7, 0
	MOVDQU 	XMM5, XMM1
	PSLLD 		XMM7, 8
	PAND 		XMM5, XMM7 				; sgreen14
	PSRLD 		XMM5, 8

	; sgreen14s = shuffled sgreen14
	PSHUFHW 	XMM5, XMM5,85H
	MOVDQU 	XMM6, XMM2
	PSHUFLW 	XMM5, XMM5,85H
	PAND 		XMM6, XMM7 				; sgreen58
	PSRLD 		XMM6, 8
	PSHUFD  	XMM5, XMM5,0D0H 		; sgreen14s

	; sgreen58 = src58&& (srcMask << 8)
	; src58 must be copied because it mustn't be changed

	; sgreen58s = shuffled sgreen58
	PSHUFHW 	XMM6, XMM6,85H
	PSHUFLW	XMM6, XMM6,58H
	PSHUFD  	XMM6, XMM6,5CH 			; sgreen58s

	; sgreen18 = sgreen14s || sgreen58s
	POR 		XMM5, XMM6 ; sgreen18

	; sgreen18255 = sgreen18 * 256- sgreen18
	MOVDQU 	XMM7, XMM5
	MOVQ2DQ	XMM6, MM5

	PSLLW 		XMM5, 8
	PSUBUSW 	XMM5, XMM7 				; sgreen18255
	PSHUFD 	XMM6, XMM6, 0

	MOVDQU 	XMM7, XMM3

	PAND 		XMM7, XMM6 ; dgreen18
	PSRLW 		XMM7,3
	;  dgreen18alpha = dgreen18 * negalpha
	PMULLW 	XMM7, XMM4 				; dgreen18alpha

	; dest is prepared
	; combining dest and src

	; dgreen18big = sgreen18255 + dgreen18alpha
	PADDUSW 	XMM7, XMM5 				; dgreen18big
	PANDN 	XMM6, XMM3  ; dest18ng0

	; dgreen18f = (dgreen18big >> 11) <<5
	PSRLW 		XMM7, 10 					; dgreen18f
	PSLLW 		XMM7, 5

 	; dest18ng0 = dest18 && (~destMaskgreen128)

 	; dest18ngf = dest18ng0 || dred18f
 	POR 		XMM6, XMM7
	MOVDQU 	XMM3, XMM6
	; green is calculated

	; calculate blue

	MOV	 	EAX, 001F001FH
	MOVD		MM6, EAX

	; get source

	; sblue14 = src14 && (srcMask)
	; srcMask
	MOVQ2DQ 	XMM7, MM3
	MOVDQU 	XMM5, XMM1

	PSHUFD 	XMM7, XMM7, 0
	MOVDQU 	XMM6, XMM2

	; sblue14 = src14 && (srcMask)
	; src14 must be copied because it mustn't be changed
	PAND 		XMM5, XMM7 				; sblue14

	; sblue14s = shuffled sblue14
	PSHUFHW 	XMM5, XMM5,85H
	PAND 		XMM6, XMM7 ; sblue58
	PSHUFHW 	XMM6, XMM6,85H

	PSHUFLW 	XMM5, XMM5,85H
	PSHUFLW 	XMM6, XMM6,58H

	PSHUFD  	XMM5, XMM5,0D0H 		; sblue14s
	PSHUFD  	XMM6, XMM6,5CH 			; sblue58s

	POR 		XMM5, XMM6 				; sblue18

	; sblue18255 = sblue18 * 256- sblue18
	MOVDQU 	XMM7, XMM5
	PSLLW 		XMM5, 8
	PSUBUSW 	XMM5, XMM7 				; sblue18255
	MOVQ2DQ	XMM6, MM6
	PSHUFD 	XMM6, XMM6, 0
	MOVDQU 	XMM7, XMM3
	PAND 		XMM7, XMM6 				; dblue18
	PSLLW 		XMM7, 3

	PMULLW 	XMM7, XMM4 				; dblue18alpha

	; dest is prepared
	; combining dest and src

	; dblue18big = sblue18255 + dblue18alpha

	PADDUSW 	XMM7, XMM5 				; dblue18big
	; dblue18f = (dblue18big >> 11)
	PANDN 	XMM6, XMM3  				; dest18nr0
 	PSRLW 		XMM7, 11 					; dblue18f

  	; dest18nr0 = dest18 && (~destMaskblue128)

 	; dest18nbf = dest18nb0 || dblue18f
 	POR 		XMM6, XMM7
	MOVDQU 	XMM3, XMM6

	; blue is calculated

	; now dest is calculated, store it
	; get 0 stuff

	MOVDQU	XMM5, [EDI]
	PAND		XMM5,XMM0
	PANDN		XMM0, XMM3
	POR		XMM0, XMM5

	MOVDQU [EDI],XMM0

endloop:
	;fx already inc  ; by sdx
	ADD 		EDI, 16
	MOV 		adr[EBP],EDI
	SUB 		x[EBP], 8
	JNZ 		innerloop 					; x>=0
	JZ 			endxloop

singlepixel: 									; original code from MMXBGRA8888Over565, adjusted to fit this procedure
	MOV 		EDI, adr[EBP]
	MOV	 	EAX, 0000000FFH
	MOVD		MM3, EAX

	; dest red -> MM4
	MOV	 	EAX, 0F800F800H
	MOVD		MM4, EAX

	; dest green -> MM5
	MOV	 	EAX, 07E007E0H
	MOVD		MM5, EAX

	; dest blue -> MM6 ; moved as MM6 is used in singlepixel
	; MOV	 EAX, 001F001FH
	; MOVD		MM6, EAX

	MOV		ECX, yfbottom[EBP]
	PINSRW 	XMM3, ECX, 1
	MOV		ECX, yftop[EBP]
	PINSRW 	XMM3, ECX, 0
	PSRLW 		XMM3,1

	MOV 		ECX, fx[EBP]

	PINSRW	XMM7, ECX,0 				; prepare for l,r

	SAR 		ECX, 16
	CMP 		ECX, 0
	JE			zerox
	JL			negativx
	MOV		EDX, sw[EBP]
	SUB		EDX, 1
	CMP		ECX, EDX
	JGE			biggerx

okx:
	MOV		EDX, ECX
	ADD		EDX, 1
	JMP			endbound2
zerox:
	MOV 		ECX, 0
	MOV		EDX, 1
	JMP 		endbound2

negativx:
	MOV 		ECX, 0
	MOV		EDX, 0
	JMP			endbound2

biggerx:
	MOV		ECX, EDX
endbound2:
	SHL			ECX, 2 						; xadd1
	SHL			EDX, 2 						; xadd2

	MOV		EAX, yadd1[EBP]
	MOV		EBX, yadd2[EBP]

	MOVD		XMM2, [EBX+EDX]
	PSLLDQ   	XMM2,4
	MOVD		XMM1, [EBX+ECX]
	POR		XMM2,XMM1
	PSLLDQ  	XMM2,4
	MOVD		XMM1, [EAX+EDX]
	POR		XMM2,XMM1
	PSLLDQ		XMM2,4
	MOVD		XMM1, [EAX+ECX]
	POR		XMM2,XMM1

	PEXTRW	EAX,XMM7,0
	AND		EAX, 0FFFFH
	PINSRW 	XMM7, EAX,1
	PINSRW	XMM7, EAX, 3 				;xfright

	NEG		EAX
	ADD		EAX, 65535
	PINSRW 	XMM7, EAX, 0
	PINSRW	XMM7, EAX, 2 				;xfleft

	PSRLW 		XMM7, 1

	MOVDQU	XMM0, XMM2
	PSRLD		XMM0, 24
	PXOR		XMM1, XMM1

	MOV		ECX, 0FFH 					; ECX locked for ca

	PINSRW	XMM1, ECX,0
	PINSRW	XMM1, ECX,2
	PINSRW	XMM1, ECX,4
	PINSRW	XMM1, ECX,6

	PCMPEQW	XMM1, XMM0

	PMOVMSKB	EAX, XMM1
	CMP		EAX, 0FFFFH
	JE 			endofalpha

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H
	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 					; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   					; XMM3 already shifted by 1
	PEXTRW	ECX, XMM0, 0

endofalpha:
	; alpha done
	CMP		ECX,0
	JE			alphazero

	; calculate red
	MOVDQU	XMM0, XMM2
	PSLLD		XMM0, 8
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 					; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   					; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0
	PINSRW	XMM4, EBX, 4
	; red done

	; calculate green
	MOVDQU	XMM0, XMM2
	PSLLD		XMM0, 16
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 					; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   					; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0
	PINSRW	XMM4, EBX, 2

	; green done

	; calculate blue
	MOVDQU	XMM0, XMM2
	PSLLD		XMM0,24
	PSRLD		XMM0, 24

	PSHUFLW	XMM0, XMM0,58H
	PSHUFHW	XMM0, XMM0,58H
	PSHUFD		XMM0,XMM0,58H

	PMADDWD XMM0,XMM7
	PSRLD		XMM0, 15 					; XMM7 already shifted by 1
	PSHUFLW	XMM0, XMM0, 58H
	PMADDWD	XMM0, XMM3
	PSRLD		XMM0,15   					; XMM3 already shifted by 1
	PEXTRW 	EBX, XMM0,0
	PINSRW	XMM4, EBX, 0

	; blue done

	; prepared source
	CMP		ECX, 0FFH   					; ECX released
	JE			alpha255

	NEG		ECX
	ADD		ECX, 0FFH
	PINSRW	XMM1, ECX, 1  				; 255-ca
	PINSRW	XMM1, ECX, 3  				; 255-ca
	PINSRW	XMM1, ECX, 5 				; 255-ca

	MOV		EAX, 0FFH
	PINSRW	XMM1, EAX, 0 				; 255
	PINSRW	XMM1, EAX, 2  				; 255
	PINSRW	XMM1, EAX, 4  				; 255

	;prepare destination
	MOV		EBX, adr[EBP]

	MOV		EBX, [EBX]

	MOV		EAX, EBX
	AND 		EAX, 01FH
	SHL			EAX,3
	PINSRW	XMM4, EAX, 1  				; dstb

	MOV		EAX, EBX
	AND 		EAX, 07E0H
	SHR		EAX, 3
	PINSRW	XMM4, EAX, 3  				; dstg

	AND 		EBX, 0F800H
	SHR		EBX,8
	PINSRW	XMM4, EBX, 5  				; dstr

	PMADDWD	XMM4, XMM1

	PSRLD		XMM4, 8
	PXOR		XMM1,XMM1
	PACKUSWB	XMM4,XMM1

	; put results into their words
	PEXTRW	EAX, XMM4, 2 				; end red
	PINSRW	XMM4,  EAX, 4

	PEXTRW	EAX, XMM4, 1 				; end green
	PINSRW	XMM4,  EAX, 2

alpha255:
	; red in XMM4,4; green in XMM4, 2; blue in XMM4,0
	;SYSTEM.PUT16(adr, ASH(cb, -3) + ASH(ASH(cg, -2), 5) + ASH(ASH(cr, -3), 11))
	PEXTRW	EAX, XMM4, 0 				; end blue
	SHR		EAX,3
	AND		EAX, 001FH

	PEXTRW	EBX, XMM4, 2 				; end green
	SHL			EBX,3
	AND		EBX, 07E0H
	OR			EAX, EBX

	PEXTRW	EBX, XMM4, 4				; end red
	SHL			EBX,8
	AND		EBX, 0F800H
	OR			EAX, EBX

	MOV		EDI,adr[EBP]
	MOV		[EDI], AX

alphazero: 									; alpha = 0, no writeback
	MOV		ECX,fx[EBP]
	ADD		ECX, sdx[EBP]
	MOV		fx[EBP],ECX

	MOV		EDI,adr[EBP]
	ADD		EDI, 2						; inc adr
	MOV		adr[EBP],EDI


	SUB		x[EBP], 1
	JNZ			innerloop

endxloop:
	MOV		EAX,fy[EBP]					; fy := fy + sdy
	ADD		EAX, sdy[EBP]
	MOV		fy[EBP], EAX

	MOV		EAX,yadr[EBP]
	ADD		EAX, dstbpr[EBP]
	;MOV	EDI, EAX
	MOV		yadr[EBP], EAX

	SUB		y[EBP], 1
	JNZ			outerloop

endyloop:
	EMMS									; declare FPU registers free
	POP 		EBX
	POPFD
*)
END SSE2Q1BGRA8888BGR565;

(* Nearest-neighbour scaling of a BGRA8888 source onto a BGR565 destination, blending source over destination.
	srcadr, dstadr: base addresses of the source and destination pixel data; srcbpr, dstbpr: bytes per row.
	dl, dt, dr, db: destination rectangle in pixels (right and bottom are exclusive).
	sx, sy: source start position; sdx, sdy: source step per destination pixel; all in 16.16 fixed point.
	sw, sh are not used by this procedure; they are part of the XScalerProc signature.
	Source pixels with alpha 0 leave the destination untouched, pixels with alpha 255 replace it.
	For other alphas the source colour is added unscaled to (256 - alpha)/256 of the destination colour,
	i.e. the formula treats the source colour as already multiplied by its alpha. *)
PROCEDURE Q0BGRA8888BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y : LONGINT; yadr, adr: SYSTEM.ADDRESS; col, col0 : LONGINT;
	cb, cg, cr, ca, dstb, dstg, dstr: LONGINT; yadd: SYSTEM.ADDRESS;
	fx, fy : LONGINT;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 2 + dt * dstbpr;
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		yadd := srcadr + (fy DIV 65536) * srcbpr;
		FOR x := dl TO dr - 1 DO
			col0 := SYSTEM.GET32(yadd + (fx DIV 65536) * 4);

			ca := (col0 DIV 1000000H MOD 100H);
			IF ca # 0 THEN
				cb := (col0 MOD 100H);
				cg := (col0 DIV 100H MOD 100H);
				cr := (col0 DIV 10000H MOD 100H);

				IF ca # 255 THEN
					(* destination colour, expanded from 5/6/5 bits to 8 bits per channel; only needed when blending *)
					col := SYSTEM.GET16(adr);
					dstb := (col MOD 32) * 8; dstg := (col DIV 32 MOD 64) * 4; dstr := (col DIV 2048 MOD 32) * 8;

					(* clamp to 255: a value of 256 would become 32 (resp. 64 for green) after the shift below
						and overflow into the neighbouring channel of the 565 word *)
					cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 255 THEN cb := 255 END;
					cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 255 THEN cg := 255 END;
					cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 255 THEN cr := 255 END
				END;
				(* pack 8-bit channels into 5 bits blue, 6 bits green, 5 bits red *)
				SYSTEM.PUT16(adr, ASH(cb, -3) + ASH(ASH(cg, -2), 5) + ASH(ASH(cr, -3), 11))
			END;
			INC(fx, sdx);
			INC(adr, 2)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q0BGRA8888BGR565;

(* Nearest-neighbour scaling of a BGRA8888 source onto a BGRA8888 destination, blending source over destination.
	srcadr, dstadr: base addresses of the source and destination pixel data; srcbpr, dstbpr: bytes per row.
	dl, dt, dr, db: destination rectangle in pixels (right and bottom are exclusive).
	sx, sy: source start position; sdx, sdy: source step per destination pixel; all in 16.16 fixed point.
	sw, sh are not used by this procedure; they are part of the XScalerProc signature.
	Source pixels with alpha 0 leave the destination untouched, pixels with alpha 255 replace it.
	For other alphas each channel (including alpha) is src + (256 - srcAlpha) * dst DIV 256, limited to 255. *)
PROCEDURE Q0BGRA8888BGRA8888(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y : LONGINT; yadr, adr: SYSTEM.ADDRESS; col, col0 : LONGINT;
	cb, cg, cr, ca, dstb, dstg, dstr, dsta : LONGINT; yadd: SYSTEM.ADDRESS;
	fx, fy : LONGINT;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 4 + dt * dstbpr;
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		yadd := srcadr + (fy DIV 65536) * srcbpr;
		FOR x := dl TO dr - 1 DO
			col0 := SYSTEM.GET32(yadd + (fx DIV 65536) * 4);

			ca := (col0 DIV 1000000H MOD 100H);
			IF ca # 0 THEN
				cb := (col0 MOD 100H);
				cg := (col0 DIV 100H MOD 100H);
				cr := (col0 DIV 10000H MOD 100H);

				IF ca # 255 THEN
					(* destination colour; only needed when blending *)
					col := SYSTEM.GET32(adr);
					dstb := (col MOD 100H);
					dstg := (col DIV 100H) MOD 100H;
					dstr := (col DIV 10000H) MOD 100H;
					dsta := (col DIV 1000000H) MOD 100H;

					(* every channel must stay within one byte: a value of 256 would carry into the next channel
						when the channels are combined below (and an alpha of 256 would leave the 32 bit word).
						All four results are computed from the original source alpha; ca is overwritten last. *)
					cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 255 THEN cb := 255 END;
					cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 255 THEN cg := 255 END;
					cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 255 THEN cr := 255 END;
					ca := (ca * 256 + (256 - ca) * dsta) DIV 256; IF ca > 255 THEN ca := 255 END
				END;

				SYSTEM.PUT32(adr, cb + SYSTEM.LSH(cg, 8) + SYSTEM.LSH(cr, 16) + SYSTEM.LSH(ca, 24))
			END;
			INC(fx, sdx);
			INC(adr, 4)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q0BGRA8888BGRA8888;

(* Nearest-neighbour scaling of a BGRA8888 source into a BGRA8888 destination; every destination pixel
	in the rectangle is overwritten with the selected source pixel (no blending).
	srcadr, dstadr: base addresses of the pixel data; srcbpr, dstbpr: bytes per row.
	dl, dt, dr, db: destination rectangle in pixels (right and bottom are exclusive).
	sx, sy: source start position; sdx, sdy: source step per destination pixel; all in 16.16 fixed point.
	sw, sh are not used by this procedure; they are part of the XScalerProc signature. *)
PROCEDURE Q0BGRA8888BGRA8888Copy(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR row, column : LONGINT; dstRow, dstPixel, srcRow : SYSTEM.ADDRESS;
	srcX, srcY : LONGINT;
BEGIN
	srcY := sy;
	dstRow := dstadr + dl * 4 + dt * dstbpr;
	row := dt;
	WHILE row < db DO
		(* integer part of the fixed-point row position selects the source line *)
		srcRow := srcadr + (srcY DIV 65536) * srcbpr;
		dstPixel := dstRow;
		srcX := sx;
		column := dl;
		WHILE column < dr DO
			SYSTEM.PUT32(dstPixel, SYSTEM.GET32(srcRow + (srcX DIV 65536) * 4));
			INC(dstPixel, 4);
			INC(srcX, sdx);
			INC(column)
		END;
		INC(dstRow, dstbpr);
		INC(srcY, sdy);
		INC(row)
	END
END Q0BGRA8888BGRA8888Copy;

(* Bilinear scaling of a BGRA8888 source onto a BGRA8888 destination, blending source over destination.
	srcadr, dstadr: base addresses of the pixel data; srcbpr, dstbpr: bytes per row.
	dl, dt, dr, db: destination rectangle in pixels (right and bottom are exclusive).
	sx, sy: source start position; sdx, sdy: source step per destination pixel; all in 16.16 fixed point.
	sw, sh: source width and height in pixels; sample coordinates are clamped to [0, sw - 1] and [0, sh - 1].
	Each destination pixel interpolates the four neighbouring source pixels. If the interpolated alpha is 0
	the destination is left untouched, if it is 255 the destination is replaced, otherwise each channel
	(including alpha) becomes src + (256 - srcAlpha) * dst DIV 256, limited to 255. *)
PROCEDURE Q1BGRA8888BGRA8888(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr: SYSTEM.ADDRESS; col, col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca, dstb, dstg, dstr, dsta : LONGINT;
	fx, fy, xadd0, xadd1: LONGINT; yadd0, yadd1: SYSTEM.ADDRESS;
BEGIN
	yadr := dstadr + dl * 4 + dt * dstbpr;
	(* shift the sample position by half a source pixel (8000H in 16.16);
		presumably so that interpolation happens between pixel centres *)
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dt TO db - 1 DO
		fx := sx;
		dstadr := yadr; (* the value parameter dstadr is reused as the running destination pixel address *)
		yadd0 := srcadr + Bounds(fy DIV 65536, 0, sh - 1) * srcbpr;
		yadd1 := srcadr + Bounds(fy DIV 65536 + 1, 0, sh - 1) * srcbpr;
		(* vertical weights depend only on fy and are therefore constant for the whole row *)
		yftop := (65536 - fy MOD 65536);
		yfbottom := (fy MOD 65536);
		FOR x := dl TO dr - 1 DO
			xadd0 := Bounds(fx DIV 65536, 0, sw - 1) * 4;
			xadd1 := Bounds(fx DIV 65536 + 1, 0, sw - 1) * 4;
			col0 := SYSTEM.GET32(yadd0 + xadd0);	(* top left *)
			col1 := SYSTEM.GET32(yadd0 + xadd1);	(* top right *)
			col2 := SYSTEM.GET32(yadd1 + xadd0);	(* bottom left *)
			col3 := SYSTEM.GET32(yadd1 + xadd1);	(* bottom right *)

			xfleft := (65536 - fx MOD 65536);
			xfright := (fx MOD 65536);

			(* interpolate alpha first; the colour channels are only needed if something is visible *)
			a0 := ((col0 DIV 1000000H MOD 100H) * xfleft + (col1 DIV 1000000H MOD 100H) * xfright) DIV 65536;
			a1 := ((col2 DIV 1000000H MOD 100H) * xfleft + (col3 DIV 1000000H MOD 100H) * xfright) DIV 65536;
			ca := (a0 * yftop + a1 * yfbottom) DIV 65536;

			IF ca # 0 THEN
				b0 := ((col0 MOD 100H) * xfleft + (col1 MOD 100H) * xfright) DIV 65536;
				g0 := ((col0 DIV 100H MOD 100H) * xfleft + (col1 DIV 100H MOD 100H) * xfright) DIV 65536;
				r0 := ((col0 DIV 10000H MOD 100H) * xfleft + (col1 DIV 10000H MOD 100H) * xfright) DIV 65536;

				b1 := ((col2 MOD 100H) * xfleft + (col3 MOD 100H) * xfright) DIV 65536;
				g1 := ((col2 DIV 100H MOD 100H) * xfleft + (col3 DIV 100H MOD 100H) * xfright) DIV 65536;
				r1 := ((col2 DIV 10000H MOD 100H) * xfleft + (col3 DIV 10000H MOD 100H) * xfright) DIV 65536;

				cb := (b0 * yftop + b1 * yfbottom) DIV 65536;
				cg := (g0 * yftop + g1 * yfbottom) DIV 65536;
				cr := (r0 * yftop + r1 * yfbottom) DIV 65536;

				IF ca # 255 THEN
					(* destination colour; only needed when blending *)
					col := SYSTEM.GET32(dstadr);
					dstb := col MOD 100H;
					dstg := col DIV 100H MOD 100H;
					dstr := col DIV 10000H MOD 100H;
					dsta := col DIV 1000000H MOD 100H;

					(* every channel must stay within one byte: a value of 256 would carry into the next channel
						when the channels are combined below. ca is overwritten last because the other
						channels need the interpolated source alpha. *)
					cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 255 THEN cb := 255 END;
					cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 255 THEN cg := 255 END;
					cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 255 THEN cr := 255 END;
					ca := (ca * 256 + (256 - ca) * dsta) DIV 256; IF ca > 255 THEN ca := 255 END
				END;
				SYSTEM.PUT32(dstadr, cb + SYSTEM.LSH(cg, 8) + SYSTEM.LSH(cr, 16) + SYSTEM.LSH(ca, 24))
			END;
			INC(fx, sdx);
			INC(dstadr, 4)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q1BGRA8888BGRA8888;

(* Bilinear scaling of a BGRA8888 source into a BGRA8888 destination in copy mode (no blending).
	srcadr, dstadr: base addresses of the pixel data; srcbpr, dstbpr: bytes per row.
	dl, dt, dr, db: destination rectangle in pixels (right and bottom are exclusive).
	sx, sy: source start position; sdx, sdy: source step per destination pixel; all in 16.16 fixed point.
	sw, sh: source width and height in pixels; sample coordinates are clamped to [0, sw - 1] and [0, sh - 1].
	Every destination pixel in the rectangle is overwritten with the interpolated source pixel, including
	pixels whose interpolated alpha is 0, matching Q0BGRA8888BGRA8888Copy which also writes unconditionally. *)
PROCEDURE Q1BGRA8888BGRA8888Copy(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr: SYSTEM.ADDRESS; col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca : LONGINT;
	fx, fy, xadd0, xadd1: LONGINT; yadd0, yadd1: SYSTEM.ADDRESS;
BEGIN
	yadr := dstadr + dl * 4 + dt * dstbpr;
	(* shift the sample position by half a source pixel (8000H in 16.16);
		presumably so that interpolation happens between pixel centres *)
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dt TO db - 1 DO
		fx := sx;
		dstadr := yadr; (* the value parameter dstadr is reused as the running destination pixel address *)
		yadd0 := srcadr + Bounds(fy DIV 65536, 0, sh - 1) * srcbpr;
		yadd1 := srcadr + Bounds(fy DIV 65536 + 1, 0, sh - 1) * srcbpr;
		(* vertical weights depend only on fy and are therefore constant for the whole row *)
		yftop := (65536 - fy MOD 65536);
		yfbottom := (fy MOD 65536);
		FOR x := dl TO dr - 1 DO
			xadd0 := Bounds(fx DIV 65536, 0, sw - 1) * 4;
			xadd1 := Bounds(fx DIV 65536 + 1, 0, sw - 1) * 4;
			col0 := SYSTEM.GET32(yadd0 + xadd0);	(* top left *)
			col1 := SYSTEM.GET32(yadd0 + xadd1);	(* top right *)
			col2 := SYSTEM.GET32(yadd1 + xadd0);	(* bottom left *)
			col3 := SYSTEM.GET32(yadd1 + xadd1);	(* bottom right *)

			xfleft := (65536 - fx MOD 65536);
			xfright := (fx MOD 65536);

			(* horizontal interpolation of the top (index 0) and bottom (index 1) pixel pairs *)
			b0 := ((col0 MOD 100H) * xfleft + (col1 MOD 100H) * xfright) DIV 65536;
			g0 := ((col0 DIV 100H MOD 100H) * xfleft + (col1 DIV 100H MOD 100H) * xfright) DIV 65536;
			r0 := ((col0 DIV 10000H MOD 100H) * xfleft + (col1 DIV 10000H MOD 100H) * xfright) DIV 65536;
			a0 := ((col0 DIV 1000000H MOD 100H) * xfleft + (col1 DIV 1000000H MOD 100H) * xfright) DIV 65536;

			b1 := ((col2 MOD 100H) * xfleft + (col3 MOD 100H) * xfright) DIV 65536;
			g1 := ((col2 DIV 100H MOD 100H) * xfleft + (col3 DIV 100H MOD 100H) * xfright) DIV 65536;
			r1 := ((col2 DIV 10000H MOD 100H) * xfleft + (col3 DIV 10000H MOD 100H) * xfright) DIV 65536;
			a1 := ((col2 DIV 1000000H MOD 100H) * xfleft + (col3 DIV 1000000H MOD 100H) * xfright) DIV 65536;

			(* vertical interpolation between the two rows *)
			cb := (b0 * yftop + b1 * yfbottom) DIV 65536;
			cg := (g0 * yftop + g1 * yfbottom) DIV 65536;
			cr := (r0 * yftop + r1 * yfbottom) DIV 65536;
			ca := (a0 * yftop + a1 * yfbottom) DIV 65536;

			(* copy mode: always store the result so that transparent source areas replace
				the previous destination content instead of leaving it behind *)
			SYSTEM.PUT32(dstadr, cb + SYSTEM.LSH(cg, 8) + SYSTEM.LSH(cr, 16) + SYSTEM.LSH(ca, 24));
			INC(fx, sdx);
			INC(dstadr, 4)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q1BGRA8888BGRA8888Copy;

(* SSE2 variant of the nearest-neighbour BGRA8888-over-BGR565 scaler (same parameter list as Q0BGRA8888BGR565).
	NOTE: everything between the procedure heading and END, including the local variable declarations,
	is enclosed in a comment. The procedure therefore currently has an empty body and does nothing when called.
	The disabled text is i386 inline assembly using MMX/SSE/SSE2 instructions. According to its own comments
	it handles eight destination pixels per iteration when at least eight remain and the destination address
	is 16 byte aligned, and otherwise falls back to a one-pixel-at-a-time path. *)
PROCEDURE SSE2Q0BGRA8888BGR565(srcadr, dstadr: SYSTEM.ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
(*
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr, adr, col, col0, col1, col2, col3 : LONGINT;
	cb, cg, cr, ca, dstb, dstg, dstr, yadd : LONGINT;
	fx, fy : LONGINT;
	w : LONGINT;

CODE {SYSTEM.i386, SYSTEM.MMX, SYSTEM.SSE, SYSTEM.SSE2}
	PUSHFD
	PUSH 		EBX
	; CLI

	PXOR		MM0, MM0
	PXOR		MM1, MM1
	PXOR		MM2, MM2
	PXOR		MM3, MM3
	PXOR		MM4, MM4
	PXOR		MM5, MM5
	PXOR		MM6, MM6
	PXOR		MM7, MM7
	PXOR		XMM1, XMM1
	PXOR		XMM2, XMM2
	PXOR		XMM3, XMM3
	MOV	 	EAX, 0000000FFH
	MOVD		MM3, EAX

	; dest red -> MM4
	MOV	 	EAX, 0F800F800H
	MOVD		MM4, EAX

	; dest green -> MM5
	MOV	 	EAX, 07E007E0H
	MOVD		MM5, EAX

	; dest blue -> MM6  ; moved as MM6 is used in singlepixel
	; MOV	 EAX, 001F001FH
	; MOVD		MM6, EAX

	MOV		EAX,sy[EBP]
	MOV		fy[EBP],EAX

	MOV		EDX, dstadr[EBP]
	MOV		EBX, dl[EBP]
	SHL			EBX, 1
	ADD		EDX, EBX
	MOV		EBX, dt[EBP]
	IMUL		EBX, dstbpr[EBP]
	ADD		EDX, EBX
	MOV		yadr[EBP], EDX

	MOV		ECX, db[EBP]
	SUB		ECX, dt[EBP]		; counter in y
	JLE			endyloop			;exit
	MOV		y[EBP], ECX

outerloop:
	MOV		EDX, sx[EBP]		; keep EDX
	MOV		fx[EBP], EDX

	MOV		EDI, yadr[EBP]
	MOV		adr[EBP], EDI

	MOV		ESI, srcadr[EBP]		; calc new source adr
	MOV		EAX, fy[EBP]
	SHR		EAX, 16				; integer part of sy
	IMUL 		EAX, srcbpr[EBP]	; sy * srcbpr
	ADD		ESI, EAX			; first source adr in ESI
	MOV		yadd[EBP], ESI

	MOV		ECX, dr[EBP]
	SUB		ECX, dl[EBP]		; counter in x
	JLE			endyloop			;exit
	MOV		x[EBP], ECX

innerloop:
	MOV 		ECX, x[EBP]
	; if x < 8 then do one pixel at the time
	CMP		ECX, 8
	JL 			singlepixel
	; else
	; take 8 at the time

	MOV		EBX, EDI
	AND 		EBX, 0FH
	CMP		EBX, 0
	JNE 		singlepixel

alleightpixels:
	MOV	 	EAX, 0000000FFH
	MOVD		MM3, EAX

	; dest red -> MM4
	MOV	 	EAX, 0F800F800H
	MOVD		MM4, EAX

	; dest green -> MM5
	MOV	 	EAX, 07E007E0H
	MOVD		MM5, EAX

	; dest blue -> MM6 ; moved as MM6 is used in singlepixel
	; MOV	 EAX, 001F001FH
	; MOVD		MM6, EAX

	; dest blue -> MM6
	MOV		EAX, 001F001FH
	MOVD		MM6, EAX

	; Load data from memory
	MOV		EBX, fx[EBP]
	MOV		ECX, EBX ; copy of fx
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX] ; col0 in EAX
	MOVD		XMM2,EAX

	MOV 		EDX, sdx[EBP]
	ADD		ECX, EDX
	MOV		EBX, ECX
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX] ; col1 in EAX
	MOVD		XMM1,EAX
	PSLLDQ		XMM1,4
	POR		XMM2,XMM1

	ADD		ECX, EDX
	MOV		EBX, ECX
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX] ; col2 in EAX
	MOVD		XMM1,EAX
	PSLLDQ		XMM1,8
	POR		XMM2,XMM1

	ADD		ECX, EDX
	MOV		EBX, ECX
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX] ; col3 in EAX
	MOVD		XMM1,EAX
	PSLLDQ		XMM1,12
	POR		XMM2,XMM1

	ADD		ECX, EDX
	MOV		EBX, ECX
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX]; col4 in EAX
	MOVD		XMM1,EAX

	ADD		ECX, EDX
	MOV		EBX, ECX
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX]; col5 in EAX
	MOVD		XMM3,EAX
	PSLLDQ		XMM3,4
	POR		XMM1,XMM3

	ADD		ECX, EDX
	MOV		EBX, ECX
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX]; col6 in EAX
	MOVD		XMM3,EAX
	PSLLDQ		XMM3,8
	POR		XMM1,XMM3

	ADD		ECX, EDX
	MOV		EBX, ECX
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV		EAX,[EBX] ; col7 in EAX
	MOVD		XMM3,EAX
	PSLLDQ		XMM3,12
	POR		XMM1,XMM3

	ADD 		ECX, EDX
	MOV		fx[EBP], ECX

	; swap regs
	; MOVDQU 	XMM4, XMM2
	; MOVDQU 	XMM2, XMM1
	; MOVDQU 	XMM1, XMM4

	; get alphas
	MOVQ2DQ	XMM4,  MM3
	MOVDQU 	XMM6, XMM2
	PSHUFD		XMM4, XMM4, 0
	MOVDQU 	XMM5, XMM1
	PSLLD 		XMM4, 24
	PAND 		XMM6, XMM4 		; alpha 5-8 in XMM6
	PAND 		XMM5, XMM4  		; alpha 1-4 in XMM5
	PSRLD 		XMM5, 24
	PSHUFHW 	XMM5, XMM5, 85H
	PSRLD 		XMM6, 24

	; put both alphas into 1 register
	PSHUFHW 	XMM6, XMM6, 85H
	PSHUFLW 	XMM5, XMM5, 85H
	PSHUFLW 	XMM6, XMM6, 58H
	PSHUFD		XMM5, XMM5, 0D0H  	; 0102030400000000
	PSHUFD 	XMM6, XMM6, 5CH 		; 0000000005060708
	PXOR 		XMM0,XMM0
	POR		XMM5, XMM6            	; XMM5 = alphas 0102030405060708

	PCMPEQD 	XMM0, XMM5
	PMOVMSKB EAX, XMM0
	CMP 		EAX, 0FFFFH 			; all alphas = zero; TEST not possible, because only 8 bits compared
	JE      		endloop

	; mask out alpha = zero

	; fd := 255-ORD(src[a]); fd = XMM4
	; MOV 	XMM4, 00FF00FF00FF00FF00FF00FF00FF00FFH
	PXOR 		XMM4, XMM4
	MOV	 	EAX, 00FFH
	PINSRW	XMM4, EAX ,0
	PSHUFLW 	XMM4, XMM4, 0
	PSHUFD 	XMM4, XMM4, 0
	PSUBW 	XMM4, XMM5
	MOV 		EAX,1H
	PINSRW	XMM3, EAX ,0
	PSHUFLW 	XMM3, XMM3, 0
	PSHUFD 	XMM3, XMM3, 0
	PADDUSW 	XMM4, XMM3

	; new red
	; calculate red 2

	; get source

	; sred14 = src14 && (srcMask <<16)
	; srcMask << 16
	MOVQ2DQ 	XMM3, MM3
	PSHUFD 	XMM3, XMM3, 0
	MOVDQU 	XMM5, XMM1
	MOVDQU 	XMM6, XMM2
	PSLLD 		XMM3, 16

	; sred14 = src14 && (srcMask << 24)
	; src14 must be copied because it mustn't be changed
	PAND 		XMM5, XMM3 			; sred14
	PSRLD 		XMM5, 16

	; sred14s = shuffled sred14
	PSHUFHW 	XMM5, XMM5,85H
	PAND 		XMM6, XMM3 			; sred58
	PSRLD 		XMM6, 16

	PSHUFLW 	XMM5, XMM5,85H
	PSHUFHW 	XMM6, XMM6,85H
	PSHUFD  	XMM5, XMM5,0D0H 	; sred14s
	PSHUFLW 	XMM6, XMM6,58H
	PSHUFD  	XMM6, XMM6,5CH 		; sred58s
	POR 		XMM5, XMM6 			; sred18

	; sred18255 = sred18 * 256- sred18
	MOVDQU 	XMM7, XMM5
	PSLLW 		XMM5, 8
	PSUBUSW 	XMM5, XMM7 			; sred18255

	; src is now ready

	;destination
	; dest18 must be copied because it mustn't be changed
	; Load data into memory
	MOV 		EDI, adr[EBP]
	MOVDQU 	XMM3, [EDI]  			;dest 1-8
	MOVQ2DQ	XMM6, MM4
	PSHUFD 	XMM6, XMM6, 0
	MOVDQU 	XMM7, XMM3

	PAND 		XMM7, XMM6 			; dred18
	PSRLW 		XMM7, 8
	;  dred18alpha = dred18 * negalpha
	PMULLW 	XMM7, XMM4 			; dred18alpha

	; dest is prepared
	; combining dest and src

	; dred18big = sred18255 + dred18alpha
	 PADDUSW XMM7, XMM5 			; dred18big
	; dred18f = dred18big && destMaskred128  because >> 11 and << 11 is && mask
	PAND 		XMM7, XMM6 			; dred18f

  	; dest18nr0 = dest18 && (~destMaskred128)
 	PANDN 	XMM6, XMM3  			; dest18nr0

 	 ; dest18nrf = dest18nr0 || dred18f
 	POR 		XMM6, XMM7

	MOVDQU 	XMM3, XMM6

	; red is calculated

	; calculate green:
	; get source

	; sgreen14 = src14 && (srcMask <<8)
	; srcMask << 8
	MOVQ2DQ 	XMM7, MM3

	PSHUFD 	XMM7, XMM7, 0
	MOVDQU 	XMM5, XMM1
	PSLLD 		XMM7, 8
	PAND 		XMM5, XMM7 			; sgreen14
	PSRLD 		XMM5, 8

	; sgreen14s = shuffled sgreen14
	PSHUFHW 	XMM5, XMM5,85H
	MOVDQU 	XMM6, XMM2
	PSHUFLW 	XMM5, XMM5,85H
	PAND 		XMM6, XMM7 			; sgreen58
	PSRLD 		XMM6, 8
	PSHUFD  	XMM5, XMM5,0D0H 	; sgreen14s

	; sgreen58 = src58&& (srcMask << 8)
	; src58 must be copied because it mustn't be changed

	; sgreen58s = shuffled sgreen58
	PSHUFHW 	XMM6, XMM6,85H
	PSHUFLW 	XMM6, XMM6,58H
	PSHUFD  	XMM6, XMM6,5CH 		; sgreen58s

	; sgreen18 = sgreen14s || sgreen58s
	POR 		XMM5, XMM6 ; sgreen18

	; sgreen18255 = sgreen18 * 256- sgreen18
	MOVDQU 	XMM7, XMM5
	MOVQ2DQ	XMM6, MM5

	PSLLW 		XMM5, 8
	PSUBUSW 	XMM5, XMM7 			; sgreen18255
	PSHUFD 	XMM6, XMM6, 0

	MOVDQU 	XMM7, XMM3

	PAND 		XMM7, XMM6 			; dgreen18
	PSRLW 		XMM7,3
	;  dgreen18alpha = dgreen18 * negalpha
	PMULLW 	XMM7, XMM4 			; dgreen18alpha

	; dest is prepared
	; combining dest and src

	; dgreen18big = sgreen18255 + dgreen18alpha

	PADDUSW 	XMM7, XMM5 			; dgreen18big
	PANDN 	XMM6, XMM3  			; dest18ng0

	; dgreen18f = (dgreen18big >> 11) <<5

	PSRLW 		XMM7, 10 				; dgreen18f
	PSLLW 		XMM7, 5

  	; dest18ng0 = dest18 && (~destMaskgreen128)

 	 ; dest18ngf = dest18ng0 || dred18f
 	POR 		XMM6, XMM7
	MOVDQU 	XMM3, XMM6
	; green is calculated

	; calculate blue

	; get source

	; sblue14 = src14 && (srcMask)
	; srcMask
	MOVQ2DQ 	XMM7, MM3
	MOVDQU 	XMM5, XMM1

	PSHUFD 	XMM7, XMM7, 0
	MOVDQU 	XMM6, XMM2

	; sblue14 = src14 && (srcMask)
	; src14 must be copied because it mustn't be changed
	PAND 		XMM5, XMM7 			; sblue14

	; sblue14s = shuffled sblue14
	PSHUFHW 	XMM5, XMM5,85H
	PAND 		XMM6, XMM7 			; sblue58
	PSHUFHW 	XMM6, XMM6,85H

	PSHUFLW 	XMM5, XMM5,85H
	PSHUFLW 	XMM6, XMM6,58H

	PSHUFD  	XMM5, XMM5,0D0H 	; sblue14s
	PSHUFD  	XMM6, XMM6,5CH 		; sblue58s

	POR 		XMM5, XMM6 			; sblue18

	; sblue18255 = sblue18 * 256- sblue18
	MOVDQU 	XMM7, XMM5
	PSLLW 		XMM5, 8
	PSUBUSW 	XMM5, XMM7 			; sblue18255
	MOVQ2DQ 	XMM6, MM6
	PSHUFD 	XMM6, XMM6, 0
	MOVDQU 	XMM7, XMM3
	PAND 		XMM7, XMM6 			; dblue18
	PSLLW 		XMM7, 3

	PMULLW 	XMM7, XMM4 			; dblue18alpha

	; dest is prepared
	; combining dest and src

	; dblue18big = sblue18255 + dblue18alpha
	 PADDUSW 	XMM7, XMM5 			; dblue18big
	; dblue18f = (dblue18big >> 11)
	PANDN 	XMM6, XMM3  			; dest18nr0
 	PSRLW 		XMM7, 11 				; dblue18f

 	; dest18nr0 = dest18 && (~destMaskblue128)

  	; dest18nbf = dest18nb0 || dblue18f
 	POR 		XMM6, XMM7
	MOVDQU 	XMM3, XMM6

	; blue is calculated

	; now dest is calculated, store it
	; get 0 stuff
	MOVDQU	XMM5, [EDI]
	PAND		XMM5,XMM0
	PANDN		XMM0, XMM3
	POR		XMM0, XMM5

	MOVDQU 	[EDI],XMM0

endloop:
	;fx already inc  ; by sdx
	ADD 		EDI, 16
	MOV 		adr[EBP],EDI
	SUB 		x[EBP], 8
	JNZ 		innerloop 				; x>=0
	JZ 			endxloop

singlepixel: ; original code from MMXBGRA8888Over565, adjusted to fit this procedure
	MOV 		EDI, adr[EBP]

	MOV	 	EAX, 0000000FFH
	MOVD		MM3, EAX

	; dest red -> MM4
	MOV	 	EAX, 0F800F800H
	MOVD		MM4, EAX

	; dest green -> MM5
	MOV	 	EAX, 07E007E0H
	MOVD		MM5, EAX

	; dest blue -> MM6 				; moved as MM6 is used in singlepixel
	; MOV	 	EAX, 001F001FH
	; MOVD		MM6, EAX

	MOV	 	EAX, 0FFFFFFFFH
	MOVD		MM7, EAX
	PUNPCKLBW	MM7, MM0 		 	; 00FF00FF00FF00FF

	MOV		EBX, fx[EBP]
	SHR		EBX,16
	SHL			EBX, 2
	ADD		EBX, yadd[EBP]

	MOV 		EAX,[EBX]
	XOR		EBX, EBX
	MOV 		BX,	[EDI]

	; 255 - alpha
	MOV		EDX, EAX
	SHR		EDX, 24

	CMP		EDX, 0
	JE			empty
	CMP		EDX, 255
	JE			full

alpha:
	NEG		EDX
	ADD		EDX, 255

	MOVD 		MM6, EDX
	PUNPCKLWD MM6, MM6
	PUNPCKLDQ MM6, MM6

	MOVD 		MM1, EAX
	; unpack dst
	MOV		EDX, EBX ; b

	SHL			EDX, 3

	AND		EDX, 0F8H
	MOV		EAX, EDX

	MOV		EDX, EBX ; g
	SHL			EDX, 5
	AND		EDX, 0FC00H
	OR			EAX, EDX

	MOV		EDX, EBX ; r
	SHL			EDX, 8
	AND		EDX, 0F80000H
	OR			EAX, EDX

	MOVD		MM2, EAX
	PUNPCKLBW	MM1, MM0  		; 0000ARGB --> 0A0R0G0B
	PMULLW 	MM1, MM7
	PUNPCKLBW	MM2, MM0  		; 0000ARGB --> 0A0R0G0B
	PMULLW 	MM2, MM6
	PADDUSW 	MM1, MM2

	;	PSRLW	MM1, 8 ; normalize
	DB 			0FH, 71H, 0D1H, 08H
	PACKUSWB 	MM1, MM0

	; HUGA BIMBO Muell
	MOVD		EAX, MM1

full:
	MOV		EBX, EAX
	AND		EBX, 0FFH
	SHR		EBX, 3
	MOV		EDX, EBX

	MOV		EBX, EAX
	SHR		EBX, 8
	AND		EBX, 0FFH
	SHR		EBX, 2
	SHL			EBX, 5
	OR			EDX, EBX

	MOV		EBX, EAX
	SHR		EBX, 16
	AND		EBX, 0FFH
	SHR		EBX, 3
	SHL			EBX, 11
	OR			EDX, EBX

	MOV 		[EDI], DX

empty:
	MOV		ECX,fx[EBP]
	ADD		ECX, sdx[EBP]
	MOV		fx[EBP],ECX

	MOV		EDI,adr[EBP]
	ADD		EDI, 2					; inc adr
	MOV		adr[EBP],EDI

	SUB		x[EBP], 1
	JNZ			innerloop

endxloop:
	MOV		EAX,fy[EBP]				; fy := fy + sdy
	ADD		EAX, sdy[EBP]
	MOV		fy[EBP], EAX

	MOV		EAX, yadr[EBP]
	ADD		EAX, dstbpr[EBP]
	MOV		EDI, EAX
	MOV		yadr[EBP], EAX

	SUB		y[EBP], 1
	JNZ			outerloop

endyloop:
	EMMS 								; declare FPU registers free
	POP 		EBX
	POPFD
*)
END SSE2Q0BGRA8888BGR565;


(** Draw the rectangle sr of src into the rectangle dr of dst, scaling as required and restricting output to clip.
	copyMode: ModeCopy overwrites the destination, ModeSrcOverDst blends the source over it.
	scaleMode: ScaleBox (nearest neighbour) or ScaleBilinear.
	clip must lie inside dst and sr inside src (checked by ASSERT).
	If source and destination rectangle have the same size the work is delegated to Raster.Copy.
	Otherwise a format specific scaler is used where one exists for the src/dst format and mode combination,
	and a generic Raster.Get/Raster.Put based scaler in all other cases.
	A destination rectangle with non-positive width or height draws nothing. *)
PROCEDURE Scale*(src : Image; sr : Rectangle; dst : Image; dr : Rectangle; clip : Rectangle; copyMode, scaleMode : LONGINT);
VAR dw, dh, sw, sh : LONGINT;
	fw, fh : LONGREAL; sx, sy, sdx, sdy : LONGINT;
	scaler : ScalerProc; xscaler : XScalerProc;
	mode : Raster.Mode;
	SSE2enabled : BOOLEAN;
BEGIN
	ASSERT((clip.l >= 0) & (clip.t >= 0) & (clip.r <= dst.width) & (clip.b <= dst.height));
	ASSERT((sr.l >= 0) & (sr.t >= 0) & (sr.r <= src.width) & (sr.b <= src.height));
	dw := dr.r - dr.l; dh := dr.b - dr.t;
	sw := sr.r - sr.l; sh := sr.b - sr.t;

	IF (sw = dw) & (sh = dh) THEN (* optimize special case: no scaling, clip manually and copy *)
		IF ~Rect.IsContained(clip, dr) THEN
			IF dr.l < clip.l THEN DEC(dw, (clip.l - dr.l)); INC(sr.l, (clip.l - dr.l)); dr.l := clip.l END;
			IF dr.t < clip.t THEN DEC(dh, (clip.t - dr.t)); INC(sr.t, (clip.t - dr.t)); dr.t := clip.t END;
			IF dr.r > clip.r THEN DEC(dw, (dr.r - clip.r)) END;
			IF dr.b > clip.b THEN DEC(dh, (dr.b - clip.b)) END;
		END;
		IF (dw > 0) & (dh > 0) THEN
			IF copyMode = ModeCopy THEN Raster.InitMode(mode, Raster.srcCopy)
			ELSE Raster.InitMode(mode, Raster.srcOverDst)
			END;
			Raster.Copy(src, dst, sr.l, sr.t, sr.l + dw, sr.t + dh, dr.l, dr.t, mode)
		END;
		RETURN
	END;

	(* an empty or inverted destination rectangle has nothing to draw;
		leaving here also keeps the divisions by dw and dh below from dividing by zero *)
	IF (dw <= 0) OR (dh <= 0) THEN RETURN END;

	(* source pixels per destination pixel *)
	fw := sw / dw;
	fh := sh / dh;
	(* source start position in 16.16 fixed point *)
	sx := sr.l * 65536;
	sy := sr.t * 65536;
	(* clipping: shrink dr to clip and advance the source start position accordingly *)
	IF ~Rect.IsContained(clip, dr) THEN
		IF dr.r > clip.r THEN dr.r := clip.r END;
		IF dr.b > clip.b THEN dr.b := clip.b END;
		IF dr.l < clip.l THEN sx := ENTIER(65536 * (sr.l +  sw * (clip.l - dr.l) / dw)); dr.l := clip.l END;
		IF dr.t < clip.t THEN sy := ENTIER(65536 * (sr.t + sh * (clip.t - dr.t) / dh)); dr.t := clip.t END;
	END;
	IF Rect.RectEmpty(dr) THEN RETURN END;

	(* source step per destination pixel in 16.16 fixed point *)
	sdx := ENTIER(fw * 65536);
	sdy := ENTIER(fh * 65536);

	xscaler := NIL;
	SSE2enabled := Raster.SSE2enabled; (*Machine.SSE2Support; *)
	SSE2enabled := FALSE; (* the SSE2 scalers are deliberately switched off here, regardless of Raster.SSE2enabled *)
	IF SSE2enabled THEN
		IF (src.fmt.code = Raster.bgr565) & (dst.fmt.code = Raster.bgr565) THEN
			IF copyMode = ModeCopy THEN
				IF scaleMode = ScaleBox THEN xscaler := SSE2Q0BGR565BGR565;
				ELSIF scaleMode = ScaleBilinear THEN xscaler := SSE2Q1BGR565BGR565;
				END;
			END;
		ELSIF (src.fmt.code = Raster.bgra8888) & (dst.fmt.code = Raster.bgr565) THEN
			IF copyMode = ModeSrcOverDst THEN
				IF scaleMode = ScaleBox THEN xscaler := SSE2Q0BGRA8888BGR565;
				ELSIF scaleMode = ScaleBilinear THEN xscaler := SSE2Q1BGRA8888BGR565;
				END;
			END;
		END;
	END;
	IF (xscaler = NIL) THEN
		IF (src.fmt.code = Raster.bgr565) & (dst.fmt.code = Raster.bgr565) THEN
			IF copyMode = ModeCopy THEN
				IF scaleMode = ScaleBox THEN xscaler := XQ0BGR565BGR565;
				ELSIF scaleMode = ScaleBilinear THEN xscaler := Q1BGR565BGR565;
				END;
			END;
		ELSIF (src.fmt.code = Raster.bgra8888) & (dst.fmt.code = Raster.bgr565) THEN
			IF copyMode = ModeSrcOverDst THEN
				IF scaleMode = ScaleBox THEN xscaler := Q0BGRA8888BGR565;
				ELSIF scaleMode = ScaleBilinear THEN xscaler := Q1BGRA8888BGR565;
				END;
			END;
		ELSIF (src.fmt.code = Raster.bgra8888) & (dst.fmt.code = Raster.bgra8888) THEN
			IF (copyMode = ModeSrcOverDst) THEN
				IF (scaleMode = ScaleBox) THEN xscaler := Q0BGRA8888BGRA8888;
				ELSIF (scaleMode = ScaleBilinear) THEN xscaler := Q1BGRA8888BGRA8888;
				END;
			ELSIF (copyMode = ModeCopy) THEN
				IF (scaleMode = ScaleBox) THEN xscaler := Q0BGRA8888BGRA8888Copy;
				ELSIF (scaleMode = ScaleBilinear) THEN xscaler := Q1BGRA8888BGRA8888Copy;
				END;
			END;
		END;
	END;

	IF xscaler # NIL THEN
		(* format specific scalers work on raw pixel memory *)
		xscaler(src.adr, dst.adr, src.bpr, dst.bpr, dr.l, dr.t, dr.r, dr.b, sx, sy,
			sdx, sdy, src.width, src.height)
	ELSE
		scaler := Q0GenericSrcOverDst; (* fallback case, also used for unknown copy or scale modes *)
		IF copyMode = ModeCopy THEN
			IF scaleMode = ScaleBox THEN scaler := Q0GenericCopy
			ELSIF scaleMode = ScaleBilinear THEN scaler := Q1GenericCopy
			END
		ELSIF copyMode = ModeSrcOverDst THEN
			IF scaleMode = ScaleBox THEN scaler := Q0GenericSrcOverDst
			ELSIF scaleMode = ScaleBilinear THEN scaler := Q1GenericSrcOverDst
			END;
		END;
		scaler(src, dst, dr, sx, sy, sdx, sdy);
	END;
END Scale;

(* Limit val to the range min .. max. The lower bound is tested first,
	so min is returned whenever val < min, even if min > max. *)
PROCEDURE Bounds(val, min, max : LONGINT) : LONGINT;
VAR clamped : LONGINT;
BEGIN
	clamped := val;
	IF val < min THEN clamped := min
	ELSIF val > max THEN clamped := max
	END;
	RETURN clamped
END Bounds;

END WMRasterScale.


SpeedTest.Mod